Files
TinyTorch/modules/18_acceleration_ABOUT.html
2025-12-05 00:52:38 +00:00

1230 lines
93 KiB
HTML
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en" data-content_root="../" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<title>18. Acceleration - CPU Vectorization &amp; Cache Optimization &#8212; Tiny🔥Torch</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=03e43079" />
<link rel="stylesheet" type="text/css" href="../_static/styles/sphinx-book-theme.css?v=eba8b062" />
<link rel="stylesheet" type="text/css" href="../_static/togglebutton.css?v=13237357" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
<link rel="stylesheet" type="text/css" href="../_static/mystnb.8ecb98da25f57f5357bf6f572d296f466b2cfe2517ffebfabe82451661e28f02.css" />
<link rel="stylesheet" type="text/css" href="../_static/sphinx-thebe.css?v=4fa983c6" />
<link rel="stylesheet" type="text/css" href="../_static/sphinx-design.min.css?v=95c83b7e" />
<link rel="stylesheet" type="text/css" href="../_static/custom.css?v=009d37f4" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
<script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/documentation_options.js?v=9eb32ce0"></script>
<script src="../_static/doctools.js?v=9a2dae69"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
<script src="../_static/copybutton.js?v=f281be69"></script>
<script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
<script>let toggleHintShow = 'Click to show';</script>
<script>let toggleHintHide = 'Click to hide';</script>
<script>let toggleOpenOnPrint = 'true';</script>
<script src="../_static/togglebutton.js?v=4a39c7ea"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script src="../_static/design-tabs.js?v=f930bc37"></script>
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
<script async="async" src="../_static/sphinx-thebe.js?v=c100c467"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
<script>DOCUMENTATION_OPTIONS.pagename = 'modules/18_acceleration_ABOUT';</script>
<script src="../_static/ml-timeline.js?v=76e9b3e3"></script>
<script src="../_static/wip-banner.js?v=04a7e74d"></script>
<script src="../_static/marimo-badges.js?v=e6289128"></script>
<script src="../_static/sidebar-link.js?v=404b701b"></script>
<script src="../_static/hero-carousel.js?v=10341d2a"></script>
<script src="../_static/subscribe-modal.js?v=42919b64"></script>
<link rel="icon" href="../_static/favicon.svg"/>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="19. Benchmarking - Fair Performance Comparison" href="19_benchmarking_ABOUT.html" />
<link rel="prev" title="17. Memoization - Computational Reuse for Inference" href="17_memoization_ABOUT.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<input type="checkbox"
class="sidebar-toggle"
id="pst-primary-sidebar-checkbox"/>
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
<input type="checkbox"
class="sidebar-toggle"
id="pst-secondary-sidebar-checkbox"/>
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search..."
aria-label="Search..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<a class="navbar-brand logo" href="../intro.html">
<img src="../_static/logo-tinytorch.png" class="logo__image only-light" alt="Tiny🔥Torch - Home"/>
<script>document.write(`<img src="../_static/logo-tinytorch.png" class="logo__image only-dark" alt="Tiny🔥Torch - Home"/>`);</script>
</a></div>
<div class="sidebar-primary-item">
<script>
document.write(`
<button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script></div>
<div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
<div class="bd-toc-item navbar-nav active">
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🚀 Getting Started</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../getting-started.html">Complete Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏗 Foundation Tier (01-07)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/foundation.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="01_tensor_ABOUT.html">01. Tensor</a></li>
<li class="toctree-l1"><a class="reference internal" href="02_activations_ABOUT.html">02. Activations</a></li>
<li class="toctree-l1"><a class="reference internal" href="03_layers_ABOUT.html">03. Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="04_losses_ABOUT.html">04. Losses</a></li>
<li class="toctree-l1"><a class="reference internal" href="05_autograd_ABOUT.html">05. Autograd</a></li>
<li class="toctree-l1"><a class="reference internal" href="06_optimizers_ABOUT.html">06. Optimizers</a></li>
<li class="toctree-l1"><a class="reference internal" href="07_training_ABOUT.html">07. Training</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏛️ Architecture Tier (08-13)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/architecture.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="08_dataloader_ABOUT.html">08. DataLoader</a></li>
<li class="toctree-l1"><a class="reference internal" href="09_spatial_ABOUT.html">09. Convolutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="10_tokenization_ABOUT.html">10. Tokenization</a></li>
<li class="toctree-l1"><a class="reference internal" href="11_embeddings_ABOUT.html">11. Embeddings</a></li>
<li class="toctree-l1"><a class="reference internal" href="12_attention_ABOUT.html">12. Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="13_transformers_ABOUT.html">13. Transformers</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">⏱️ Optimization Tier (14-19)</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/optimization.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="14_profiling_ABOUT.html">14. Profiling</a></li>
<li class="toctree-l1"><a class="reference internal" href="15_quantization_ABOUT.html">15. Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="16_compression_ABOUT.html">16. Compression</a></li>
<li class="toctree-l1"><a class="reference internal" href="17_memoization_ABOUT.html">17. Memoization</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">18. Acceleration</a></li>
<li class="toctree-l1"><a class="reference internal" href="19_benchmarking_ABOUT.html">19. Benchmarking</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏅 Capstone Competition</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/olympics.html">📖 Competition Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="20_capstone_ABOUT.html">20. Torch Olympics</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🧭 Course Orientation</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../chapters/00-introduction.html">Course Structure</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prerequisites.html">Prerequisites &amp; Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/learning-journey.html">Learning Journey</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/milestones.html">Historical Milestones</a></li>
<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🛠️ TITO CLI Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tito/overview.html">Command Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/modules.html">Module Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/milestones.html">Milestone System</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/data.html">Progress &amp; Data</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="../datasets.html">Datasets Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🤝 Community</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../community.html">Ecosystem</a></li>
<li class="toctree-l1"><a class="reference internal" href="../resources.html">Learning Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../credits.html">Credits &amp; Acknowledgments</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="sbt-scroll-pixel-helper"></div>
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-bars"></span>
</button></div>
</div>
<div class="header-article-items__end">
<div class="header-article-item">
<div class="article-header-buttons">
<div class="dropdown dropdown-download-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
<i class="fas fa-download"></i>
</button>
<ul class="dropdown-menu">
<li><a href="../_sources/modules/18_acceleration_ABOUT.md" target="_blank"
class="btn btn-sm btn-download-source-button dropdown-item"
title="Download source file"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file"></i>
</span>
<span class="btn__text-container">.md</span>
</a>
</li>
<li>
<button onclick="window.print()"
class="btn btn-sm btn-download-pdf-button dropdown-item"
title="Print to PDF"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file-pdf"></i>
</span>
<span class="btn__text-container">.pdf</span>
</button>
</li>
</ul>
</div>
<button onclick="toggleFullScreen()"
class="btn btn-sm btn-fullscreen-button"
title="Fullscreen mode"
data-bs-placement="bottom" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-expand"></i>
</span>
</button>
<script>
document.write(`
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
</button>
`);
</script>
<script>
document.write(`
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass fa-lg"></i>
</button>
`);
</script>
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-list"></span>
</button>
</div></div>
</div>
</div>
</div>
<div id="jb-print-docs-body" class="onlyprint">
<h1>18. Acceleration - CPU Vectorization & Cache Optimization</h1>
<!-- Table of contents -->
<div id="print-main-content">
<div id="jb-print-toc">
<div>
<h2> Contents </h2>
</div>
<nav aria-label="Page">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-optimize">Build → Use → Optimize</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#why-this-matters-the-hardware-reality">Why This Matters: The Hardware Reality</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-performance-gap">The Performance Gap</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#from-naive-python-to-production-performance">From Naive Python to Production Performance</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-impact">Real-World Impact</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#the-roofline-model-your-performance-compass">The Roofline Model: Your Performance Compass</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#understanding-hardware-limits">Understanding Hardware Limits</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#example-calculations">Example Calculations</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#vectorized-matrix-multiplication">1. Vectorized Matrix Multiplication</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#kernel-fusion-eliminating-memory-traffic">2. Kernel Fusion: Eliminating Memory Traffic</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#cache-aware-tiling-blocked-algorithms">3. Cache-Aware Tiling (Blocked Algorithms)</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#roofline-analysis-in-practice">4. Roofline Analysis in Practice</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-performance-analysis">Inline Testing &amp; Performance Analysis</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#roofline-analysis-foundations">Roofline Analysis Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#optimization-strategy-characteristics">Optimization Strategy Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
</nav>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section id="acceleration-cpu-vectorization-cache-optimization">
<h1>18. Acceleration - CPU Vectorization &amp; Cache Optimization<a class="headerlink" href="#acceleration-cpu-vectorization-cache-optimization" title="Link to this heading">#</a></h1>
<p><strong>OPTIMIZATION TIER</strong> | Difficulty: ⭐⭐⭐ (3/4) | Time: 6-8 hours</p>
<section id="overview">
<h2>Overview<a class="headerlink" href="#overview" title="Link to this heading">#</a></h2>
<p>The Acceleration module teaches you to extract maximum performance from modern CPUs through hardware-aware optimization techniques. Youll learn to leverage optimized BLAS libraries for vectorized matrix operations, implement cache-friendly algorithms that exploit memory hierarchy, and apply kernel fusion to eliminate memory bandwidth bottlenecks. By mastering the roofline model and arithmetic intensity analysis, youll develop the systematic thinking needed to accelerate real ML systems from research prototypes to production deployments.</p>
<p>This is CPU-focused acceleration—the foundation for understanding GPU perf. Youll work with NumPys BLAS backend (MKL, OpenBLAS) to achieve 10-100x speedups over naive Python, understand why most operations are memory-bound rather than compute-bound, and learn the measurement-driven optimization workflow used by PyTorch, TensorFlow, and production ML systems.</p>
</section>
<section id="learning-objectives">
<h2>Learning Objectives<a class="headerlink" href="#learning-objectives" title="Link to this heading">#</a></h2>
<p>By the end of this module, you will be able to:</p>
<ul class="simple">
<li><p><strong>Understand Hardware Bottlenecks</strong>: Apply the roofline model to determine whether operations are compute-bound or memory-bound, and predict performance limits from hardware specifications</p></li>
<li><p><strong>Leverage BLAS Vectorization</strong>: Use optimized linear algebra libraries that exploit SIMD instructions and multi-threading to achieve 10-100x speedups over naive implementations</p></li>
<li><p><strong>Implement Cache-Aware Algorithms</strong>: Design blocked/tiled algorithms that maximize cache hit rates by fitting working sets into L1/L2 cache for 2-5x memory performance gains</p></li>
<li><p><strong>Apply Kernel Fusion</strong>: Reduce memory bandwidth usage by 60-80% through fusing element-wise operations into single expressions that eliminate intermediate array allocations</p></li>
<li><p><strong>Measure Systematically</strong>: Integrate with Module 14 profiling to validate optimization impact, measure FLOPs efficiency, and calculate arithmetic intensity for real workloads</p></li>
</ul>
</section>
<section id="build-use-optimize">
<h2>Build → Use → Optimize<a class="headerlink" href="#build-use-optimize" title="Link to this heading">#</a></h2>
<p>This module follows TinyTorchs <strong>Build → Use → Optimize</strong> framework:</p>
<ol class="arabic simple">
<li><p><strong>Build</strong>: Implement vectorized matrix multiplication using BLAS, fused GELU activation, and tiled algorithms for cache efficiency</p></li>
<li><p><strong>Use</strong>: Apply acceleration to realistic transformer blocks, analyze memory access patterns, and measure performance across different tensor sizes</p></li>
<li><p><strong>Optimize</strong>: Analyze roofline characteristics, measure arithmetic intensity, and develop systematic decision frameworks for production optimization strategies</p></li>
</ol>
</section>
<section id="why-this-matters-the-hardware-reality">
<h2>Why This Matters: The Hardware Reality<a class="headerlink" href="#why-this-matters-the-hardware-reality" title="Link to this heading">#</a></h2>
<section id="the-performance-gap">
<h3>The Performance Gap<a class="headerlink" href="#the-performance-gap" title="Link to this heading">#</a></h3>
<p>Modern ML workloads face a fundamental challenge: <strong>the speed gap between computation and memory access grows every year</strong>. Consider a typical CPU:</p>
<ul class="simple">
<li><p><strong>Peak Compute</strong>: 200-500 GFLOP/s (billions of floating-point operations per second)</p></li>
<li><p><strong>Memory Bandwidth</strong>: 20-50 GB/s (data transfer rate from RAM to CPU)</p></li>
<li><p><strong>Imbalance</strong>: CPUs can perform 10-20 floating-point operations in the time it takes to fetch a single float from memory</p></li>
</ul>
<p>This means <strong>most ML operations are memory-bound, not compute-bound</strong>. Naive implementations waste computation cycles waiting for data. Professional optimization is about feeding the compute units efficiently.</p>
</section>
<section id="from-naive-python-to-production-performance">
<h3>From Naive Python to Production Performance<a class="headerlink" href="#from-naive-python-to-production-performance" title="Link to this heading">#</a></h3>
<p>The performance hierarchy for ML operations:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">Naive</span> <span class="n">Python</span> <span class="n">loops</span><span class="p">:</span> <span class="mi">1</span> <span class="n">GFLOP</span><span class="o">/</span><span class="n">s</span> <span class="p">(</span><span class="n">baseline</span><span class="p">)</span>
<span class="n">NumPy</span> <span class="p">(</span><span class="n">vectorized</span><span class="p">):</span> <span class="mi">10</span><span class="o">-</span><span class="mi">50</span> <span class="n">GFLOP</span><span class="o">/</span><span class="n">s</span> <span class="p">(</span><span class="mi">10</span><span class="o">-</span><span class="mi">50</span><span class="n">x</span> <span class="n">faster</span><span class="p">)</span>
<span class="n">Optimized</span> <span class="n">BLAS</span> <span class="p">(</span><span class="n">this</span> <span class="n">module</span><span class="p">):</span> <span class="mi">100</span><span class="o">-</span><span class="mi">500</span> <span class="n">GFLOP</span><span class="o">/</span><span class="n">s</span> <span class="p">(</span><span class="mi">100</span><span class="o">-</span><span class="mi">500</span><span class="n">x</span> <span class="n">faster</span><span class="p">)</span>
<span class="n">GPU</span> <span class="n">CUDA</span> <span class="n">kernels</span><span class="p">:</span> <span class="mi">1</span><span class="p">,</span><span class="mi">000</span><span class="o">-</span><span class="mi">10</span><span class="p">,</span><span class="mi">000</span> <span class="n">GFLOP</span><span class="o">/</span><span class="n">s</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span><span class="mi">000</span><span class="o">-</span><span class="mi">10</span><span class="p">,</span><span class="mi">000</span><span class="n">x</span> <span class="n">faster</span><span class="p">)</span>
</pre></div>
</div>
<p>This module focuses on the <strong>100-500x speedup</strong> achievable on CPUs through:</p>
<ul class="simple">
<li><p><strong>SIMD vectorization</strong>: Process 4-8 floats per instruction (AVX2/AVX-512)</p></li>
<li><p><strong>Multi-threading</strong>: Use all CPU cores (4-8x parallelism)</p></li>
<li><p><strong>Cache blocking</strong>: Keep data in fast cache memory (10-100x faster than RAM)</p></li>
<li><p><strong>Kernel fusion</strong>: Reduce memory traffic by 4-10x</p></li>
</ul>
</section>
<section id="real-world-impact">
<h3>Real-World Impact<a class="headerlink" href="#real-world-impact" title="Link to this heading">#</a></h3>
<p>These techniques enable:</p>
<ul class="simple">
<li><p><strong>Faster iteration</strong>: Train models in hours instead of days during research</p></li>
<li><p><strong>Lower costs</strong>: More efficient use of cloud compute resources</p></li>
<li><p><strong>Edge deployment</strong>: Run models on CPUs without GPU requirements</p></li>
<li><p><strong>Better scaling</strong>: Handle larger models and batch sizes within memory limits</p></li>
</ul>
<p>Understanding CPU optimization is prerequisite for GPU programming—same principles, different scale.</p>
</section>
</section>
<section id="the-roofline-model-your-performance-compass">
<h2>The Roofline Model: Your Performance Compass<a class="headerlink" href="#the-roofline-model-your-performance-compass" title="Link to this heading">#</a></h2>
<section id="understanding-hardware-limits">
<h3>Understanding Hardware Limits<a class="headerlink" href="#understanding-hardware-limits" title="Link to this heading">#</a></h3>
<p>The <strong>roofline model</strong> is the fundamental tool for understanding performance bottlenecks. It plots two hardware limits:</p>
<ol class="arabic simple">
<li><p><strong>Compute Roof</strong>: Maximum FLOPs the processor can execute per second</p></li>
<li><p><strong>Memory Roof</strong>: Maximum data bandwidth × arithmetic intensity</p></li>
</ol>
<p><strong>Arithmetic Intensity (AI)</strong> = FLOPs performed / Bytes accessed</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Performance Compute Bound Region
(GFLOPS) ┌─────────────────────
│ Peak Compute (500 GFLOP/s)
╱│
│ Memory Bound Region
╱────│──────────────────────
╱───────│──────────────────── Arithmetic Intensity
│ (FLOPs/Byte)
Low│ High
(&lt;1)│ (&gt;10)
</pre></div>
</div>
<p><strong>Key Insight</strong>: If your operation falls below the roofline (left side), adding more compute wont help—you need to reduce memory traffic through algorithmic improvements.</p>
</section>
<section id="example-calculations">
<h3>Example Calculations<a class="headerlink" href="#example-calculations" title="Link to this heading">#</a></h3>
<p><strong>Element-wise addition</strong>: <code class="docutils literal notranslate"><span class="pre">c</span> <span class="pre">=</span> <span class="pre">a</span> <span class="pre">+</span> <span class="pre">b</span></code></p>
<ul class="simple">
<li><p>FLOPs: N (one addition per element)</p></li>
<li><p>Bytes: 3N × 4 bytes (read a, read b, write c)</p></li>
<li><p>AI = N / (12N) = <strong>0.08 FLOPs/byte</strong> → Severely memory-bound</p></li>
</ul>
<p><strong>Matrix multiplication</strong>: <code class="docutils literal notranslate"><span class="pre">C</span> <span class="pre">=</span> <span class="pre">A</span> <span class="pre">&#64;</span> <span class="pre">B</span></code> for N×N matrices</p>
<ul class="simple">
<li><p>FLOPs: 2N³ (dot product for each of N² output elements)</p></li>
<li><p>Bytes: 3N² × 4 bytes (read A, read B, write C)</p></li>
<li><p>AI = 2N³ / (12N²) = <strong>N/6 FLOPs/byte</strong> → Compute-bound for large N</p></li>
</ul>
<p>For N=1024: AI = 171 FLOPs/byte—squarely in the compute-bound region. This is why matrix multiplication is ideal for GPUs and why transformers (which are mostly matmuls) run efficiently on accelerators.</p>
</section>
</section>
<section id="implementation-guide">
<h2>Implementation Guide<a class="headerlink" href="#implementation-guide" title="Link to this heading">#</a></h2>
<section id="vectorized-matrix-multiplication">
<h3>1. Vectorized Matrix Multiplication<a class="headerlink" href="#vectorized-matrix-multiplication" title="Link to this heading">#</a></h3>
<p><strong>The Challenge</strong>: Naive triple-nested loops in Python achieve ~1 GFLOP/s. We need 100-500 GFLOP/s.</p>
<p><strong>The Solution</strong>: Leverage optimized BLAS (Basic Linear Algebra Subprograms) libraries that implement:</p>
<ul class="simple">
<li><p><strong>SIMD vectorization</strong>: AVX2/AVX-512 instructions process 4-8 floats simultaneously</p></li>
<li><p><strong>Multi-threading</strong>: Automatic parallelization across CPU cores (OpenMP)</p></li>
<li><p><strong>Cache blocking</strong>: Tiled algorithms that keep working sets in L1/L2 cache</p></li>
</ul>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">vectorized_matmul</span><span class="p">(</span><span class="n">a</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">,</span> <span class="n">b</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> High-performance matrix multiplication using optimized BLAS.</span>
<span class="sd"> NumPy&#39;s matmul calls GEMM (General Matrix Multiply) from:</span>
<span class="sd"> - Intel MKL (Math Kernel Library) - 200-500 GFLOP/s on modern CPUs</span>
<span class="sd"> - OpenBLAS - 100-300 GFLOP/s</span>
<span class="sd"> - Apple Accelerate - optimized for M1/M2 chips</span>
<span class="sd"> These libraries implement decades of optimization research.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># Input validation</span>
<span class="k">if</span> <span class="n">a</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> <span class="o">!=</span> <span class="n">b</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="o">-</span><span class="mi">2</span><span class="p">]:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Shape mismatch: </span><span class="si">{</span><span class="n">a</span><span class="o">.</span><span class="n">shape</span><span class="si">}</span><span class="s2"> @ </span><span class="si">{</span><span class="n">b</span><span class="o">.</span><span class="n">shape</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="c1"># Delegate to highly optimized BLAS implementation</span>
<span class="c1"># This single line replaces thousands of lines of hand-tuned assembly</span>
<span class="n">result_data</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">matmul</span><span class="p">(</span><span class="n">a</span><span class="o">.</span><span class="n">data</span><span class="p">,</span> <span class="n">b</span><span class="o">.</span><span class="n">data</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">result_data</span><span class="p">)</span>
</pre></div>
</div>
<p><strong>Performance Characteristics</strong>:</p>
<ul class="simple">
<li><p><strong>Small matrices</strong> (N &lt; 64): 10-30 GFLOP/s, limited by overhead</p></li>
<li><p><strong>Medium matrices</strong> (N = 64-512): 100-300 GFLOP/s, optimal cache reuse</p></li>
<li><p><strong>Large matrices</strong> (N &gt; 1024): 200-500 GFLOP/s, memory bandwidth saturated</p></li>
</ul>
<p><strong>Measured Speedups</strong> (vs. naive triple loop):</p>
<ul class="simple">
<li><p>128×128: <strong>50x faster</strong> (5ms → 0.1ms)</p></li>
<li><p>512×512: <strong>120x faster</strong> (800ms → 6.5ms)</p></li>
<li><p>2048×2048: <strong>150x faster</strong> (100s → 0.67s)</p></li>
</ul>
</section>
<section id="kernel-fusion-eliminating-memory-traffic">
<h3>2. Kernel Fusion: Eliminating Memory Traffic<a class="headerlink" href="#kernel-fusion-eliminating-memory-traffic" title="Link to this heading">#</a></h3>
<p><strong>The Problem</strong>: Element-wise operations are memory-bound. Consider GELU activation:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>GELU(x) = 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³)))
</pre></div>
</div>
<p><strong>Unfused implementation</strong> (naive):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">temp1</span> <span class="o">=</span> <span class="n">x</span> <span class="o">**</span> <span class="mi">3</span> <span class="c1"># Read x, write temp1</span>
<span class="n">temp2</span> <span class="o">=</span> <span class="mf">0.044715</span> <span class="o">*</span> <span class="n">temp1</span> <span class="c1"># Read temp1, write temp2</span>
<span class="n">temp3</span> <span class="o">=</span> <span class="n">x</span> <span class="o">+</span> <span class="n">temp2</span> <span class="c1"># Read x, temp2, write temp3</span>
<span class="n">temp4</span> <span class="o">=</span> <span class="n">sqrt_2_pi</span> <span class="o">*</span> <span class="n">temp3</span> <span class="c1"># Read temp3, write temp4</span>
<span class="n">temp5</span> <span class="o">=</span> <span class="n">tanh</span><span class="p">(</span><span class="n">temp4</span><span class="p">)</span> <span class="c1"># Read temp4, write temp5</span>
<span class="n">temp6</span> <span class="o">=</span> <span class="mf">1.0</span> <span class="o">+</span> <span class="n">temp5</span> <span class="c1"># Read temp5, write temp6</span>
<span class="n">temp7</span> <span class="o">=</span> <span class="n">x</span> <span class="o">*</span> <span class="n">temp6</span> <span class="c1"># Read x, temp6, write temp7</span>
<span class="n">result</span> <span class="o">=</span> <span class="mf">0.5</span> <span class="o">*</span> <span class="n">temp7</span> <span class="c1"># Read temp7, write result</span>
<span class="c1"># Total: 8 reads + 8 writes = 16 memory operations per element</span>
</pre></div>
</div>
<p><strong>Fused implementation</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">fused_gelu</span><span class="p">(</span><span class="n">x</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Fused GELU activation - all operations in single expression.</span>
<span class="sd"> Memory efficiency:</span>
<span class="sd"> - Unfused: 16 memory ops per element</span>
<span class="sd"> - Fused: 2 memory ops per element (read x, write result)</span>
<span class="sd"> - Reduction: 87.5% less memory traffic</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">sqrt_2_over_pi</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="mf">2.0</span> <span class="o">/</span> <span class="n">np</span><span class="o">.</span><span class="n">pi</span><span class="p">)</span>
<span class="c1"># Single expression - NumPy optimizes into minimal memory operations</span>
<span class="n">result_data</span> <span class="o">=</span> <span class="mf">0.5</span> <span class="o">*</span> <span class="n">x</span><span class="o">.</span><span class="n">data</span> <span class="o">*</span> <span class="p">(</span>
<span class="mf">1.0</span> <span class="o">+</span> <span class="n">np</span><span class="o">.</span><span class="n">tanh</span><span class="p">(</span><span class="n">sqrt_2_over_pi</span> <span class="o">*</span> <span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">data</span> <span class="o">+</span> <span class="mf">0.044715</span> <span class="o">*</span> <span class="n">x</span><span class="o">.</span><span class="n">data</span><span class="o">**</span><span class="mi">3</span><span class="p">))</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">result_data</span><span class="p">)</span>
</pre></div>
</div>
<p><strong>Measured Performance</strong> (2000×2000 tensor):</p>
<ul class="simple">
<li><p>Unfused: 45ms (7 temporary arrays allocated)</p></li>
<li><p>Fused: 18ms (0 temporary arrays)</p></li>
<li><p><strong>Speedup: 2.5x faster</strong> through memory bandwidth reduction alone</p></li>
</ul>
<p><strong>Memory Usage</strong>:</p>
<ul class="simple">
<li><p>Unfused: ~320MB (8 arrays × 2000×2000 × 4 bytes × overhead)</p></li>
<li><p>Fused: ~32MB (input + output only)</p></li>
<li><p><strong>Memory reduction: 90%</strong></p></li>
</ul>
</section>
<section id="cache-aware-tiling-blocked-algorithms">
<h3>3. Cache-Aware Tiling (Blocked Algorithms)<a class="headerlink" href="#cache-aware-tiling-blocked-algorithms" title="Link to this heading">#</a></h3>
<p><strong>The Memory Hierarchy</strong>:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">L1</span> <span class="n">Cache</span><span class="p">:</span> <span class="mi">32</span><span class="o">-</span><span class="mi">64</span> <span class="n">KB</span> <span class="mi">1</span><span class="o">-</span><span class="mi">4</span> <span class="n">cycles</span> <span class="o">~</span><span class="mi">1</span> <span class="n">TB</span><span class="o">/</span><span class="n">s</span> <span class="n">bandwidth</span>
<span class="n">L2</span> <span class="n">Cache</span><span class="p">:</span> <span class="mi">256</span><span class="n">KB</span><span class="o">-</span><span class="mi">1</span><span class="n">MB</span> <span class="mi">10</span><span class="o">-</span><span class="mi">20</span> <span class="n">cycles</span> <span class="o">~</span><span class="mi">500</span> <span class="n">GB</span><span class="o">/</span><span class="n">s</span> <span class="n">bandwidth</span>
<span class="n">L3</span> <span class="n">Cache</span><span class="p">:</span> <span class="mi">8</span><span class="o">-</span><span class="mi">32</span> <span class="n">MB</span> <span class="mi">40</span><span class="o">-</span><span class="mi">75</span> <span class="n">cycles</span> <span class="o">~</span><span class="mi">200</span> <span class="n">GB</span><span class="o">/</span><span class="n">s</span> <span class="n">bandwidth</span>
<span class="n">Main</span> <span class="n">RAM</span><span class="p">:</span> <span class="mi">8</span><span class="o">-</span><span class="mi">64</span> <span class="n">GB</span> <span class="mi">100</span><span class="o">-</span><span class="mi">300</span> <span class="n">cycles</span> <span class="o">~</span><span class="mi">20</span><span class="o">-</span><span class="mi">50</span> <span class="n">GB</span><span class="o">/</span><span class="n">s</span> <span class="n">bandwidth</span>
</pre></div>
</div>
<p><strong>The Problem</strong>: Naive matrix multiplication for 2048×2048 matrices accesses:</p>
<ul class="simple">
<li><p>Data size: 3 × 2048² × 4 bytes = 50MB (doesnt fit in L1/L2 cache)</p></li>
<li><p>Result: Most accesses hit L3 or RAM (100-300 cycle latency)</p></li>
</ul>
<p><strong>The Solution</strong>: Block/tile matrices into cache-sized chunks</p>
<p><strong>Conceptual Tiled Algorithm</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">tiled_matmul_concept</span><span class="p">(</span><span class="n">A</span><span class="p">,</span> <span class="n">B</span><span class="p">,</span> <span class="n">tile_size</span><span class="o">=</span><span class="mi">64</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Conceptual tiling algorithm (educational).</span>
<span class="sd"> In practice, BLAS libraries implement this automatically</span>
<span class="sd"> with hardware-specific tuning for optimal tile sizes.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">N</span> <span class="o">=</span> <span class="n">A</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">C</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="n">N</span><span class="p">,</span> <span class="n">N</span><span class="p">))</span>
<span class="c1"># Process matrix in tile_size × tile_size blocks</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">N</span><span class="p">,</span> <span class="n">tile_size</span><span class="p">):</span>
<span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">N</span><span class="p">,</span> <span class="n">tile_size</span><span class="p">):</span>
<span class="k">for</span> <span class="n">k</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">N</span><span class="p">,</span> <span class="n">tile_size</span><span class="p">):</span>
<span class="c1"># This block fits in L1/L2 cache (64×64×4 = 16KB)</span>
<span class="c1"># All accesses hit fast cache instead of slow RAM</span>
<span class="n">i_end</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="n">tile_size</span><span class="p">,</span> <span class="n">N</span><span class="p">)</span>
<span class="n">j_end</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">j</span> <span class="o">+</span> <span class="n">tile_size</span><span class="p">,</span> <span class="n">N</span><span class="p">)</span>
<span class="n">k_end</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">k</span> <span class="o">+</span> <span class="n">tile_size</span><span class="p">,</span> <span class="n">N</span><span class="p">)</span>
<span class="n">C</span><span class="p">[</span><span class="n">i</span><span class="p">:</span><span class="n">i_end</span><span class="p">,</span> <span class="n">j</span><span class="p">:</span><span class="n">j_end</span><span class="p">]</span> <span class="o">+=</span> <span class="n">A</span><span class="p">[</span><span class="n">i</span><span class="p">:</span><span class="n">i_end</span><span class="p">,</span> <span class="n">k</span><span class="p">:</span><span class="n">k_end</span><span class="p">]</span> <span class="o">@</span> <span class="n">B</span><span class="p">[</span><span class="n">k</span><span class="p">:</span><span class="n">k_end</span><span class="p">,</span> <span class="n">j</span><span class="p">:</span><span class="n">j_end</span><span class="p">]</span>
<span class="k">return</span> <span class="n">C</span>
</pre></div>
</div>
<p><strong>Cache Efficiency Analysis</strong>:</p>
<ul class="simple">
<li><p><strong>Naive algorithm</strong>: 99% L3/RAM accesses (slow)</p></li>
<li><p><strong>Blocked algorithm</strong> (64×64 tiles): 95% L1/L2 hits (fast)</p></li>
<li><p><strong>Latency reduction</strong>: 300 cycles → 10 cycles average</p></li>
<li><p><strong>Effective speedup</strong>: 2-5x for large matrices</p></li>
</ul>
<p><strong>Optimal Tile Sizes</strong> (empirically determined):</p>
<ul class="simple">
<li><p>L1-focused: 32×32 (4KB per block)</p></li>
<li><p>L2-focused: 64×64 (16KB per block) ← sweet spot for most CPUs</p></li>
<li><p>L3-focused: 128×128 (64KB per block)</p></li>
</ul>
<p>Note: In this module, we use NumPys <code class="docutils literal notranslate"><span class="pre">matmul</span></code> which delegates to BLAS libraries (MKL, OpenBLAS) that already implement sophisticated cache blocking with hardware-specific tuning. Production implementations use tile sizes, loop unrolling, and prefetching tuned for specific CPU architectures.</p>
</section>
<section id="roofline-analysis-in-practice">
<h3>4. Roofline Analysis in Practice<a class="headerlink" href="#roofline-analysis-in-practice" title="Link to this heading">#</a></h3>
<p><strong>Measuring Your Hardware</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">analyze_arithmetic_intensity</span><span class="p">():</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Measure actual performance vs. theoretical roofline.&quot;&quot;&quot;</span>
<span class="c1"># Theoretical hardware limits (example: modern Intel CPU)</span>
<span class="n">peak_compute</span> <span class="o">=</span> <span class="mi">400</span> <span class="c1"># GFLOP/s (AVX-512, 8 cores, 3.5 GHz)</span>
<span class="n">peak_bandwidth</span> <span class="o">=</span> <span class="mi">45</span> <span class="c1"># GB/s (DDR4-2666, dual-channel)</span>
<span class="n">operations</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">&quot;Element-wise add&quot;</span><span class="p">:</span> <span class="p">{</span>
<span class="s2">&quot;flops&quot;</span><span class="p">:</span> <span class="n">N</span><span class="p">,</span>
<span class="s2">&quot;bytes&quot;</span><span class="p">:</span> <span class="mi">3</span> <span class="o">*</span> <span class="n">N</span> <span class="o">*</span> <span class="mi">4</span><span class="p">,</span>
<span class="s2">&quot;ai&quot;</span><span class="p">:</span> <span class="mf">0.08</span> <span class="c1"># FLOPs/byte</span>
<span class="p">},</span>
<span class="s2">&quot;Matrix multiply&quot;</span><span class="p">:</span> <span class="p">{</span>
<span class="s2">&quot;flops&quot;</span><span class="p">:</span> <span class="mi">2</span> <span class="o">*</span> <span class="n">N</span><span class="o">**</span><span class="mi">3</span><span class="p">,</span>
<span class="s2">&quot;bytes&quot;</span><span class="p">:</span> <span class="mi">3</span> <span class="o">*</span> <span class="n">N</span><span class="o">**</span><span class="mi">2</span> <span class="o">*</span> <span class="mi">4</span><span class="p">,</span>
<span class="s2">&quot;ai&quot;</span><span class="p">:</span> <span class="n">N</span> <span class="o">/</span> <span class="mi">6</span> <span class="c1"># For N=1024: 171 FLOPs/byte</span>
<span class="p">}</span>
<span class="p">}</span>
<span class="c1"># Predicted performance = min(peak_compute, ai × peak_bandwidth)</span>
<span class="k">for</span> <span class="n">op</span><span class="p">,</span> <span class="n">metrics</span> <span class="ow">in</span> <span class="n">operations</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="n">predicted_gflops</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span>
<span class="n">peak_compute</span><span class="p">,</span>
<span class="n">metrics</span><span class="p">[</span><span class="s2">&quot;ai&quot;</span><span class="p">]</span> <span class="o">*</span> <span class="n">peak_bandwidth</span>
<span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">op</span><span class="si">}</span><span class="s2">: </span><span class="si">{</span><span class="n">predicted_gflops</span><span class="si">:</span><span class="s2">.1f</span><span class="si">}</span><span class="s2"> GFLOP/s (predicted)&quot;</span><span class="p">)</span>
</pre></div>
</div>
<p><strong>Example Analysis</strong> (N=1024):</p>
<div class="pst-scrollable-table-container"><table class="table">
<thead>
<tr class="row-odd"><th class="head"><p>Operation</p></th>
<th class="head"><p>AI (FLOPs/byte)</p></th>
<th class="head"><p>Predicted GFLOP/s</p></th>
<th class="head"><p>Measured GFLOP/s</p></th>
<th class="head"><p>Efficiency</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p>Element-wise add</p></td>
<td><p>0.08</p></td>
<td><p>3.6 (memory-bound)</p></td>
<td><p>3.2</p></td>
<td><p>89%</p></td>
</tr>
<tr class="row-odd"><td><p>GELU (fused)</p></td>
<td><p>1.0</p></td>
<td><p>45 (memory-bound)</p></td>
<td><p>38</p></td>
<td><p>84%</p></td>
</tr>
<tr class="row-even"><td><p>Matrix multiply</p></td>
<td><p>171</p></td>
<td><p>400 (compute-bound)</p></td>
<td><p>320</p></td>
<td><p>80%</p></td>
</tr>
</tbody>
</table>
</div>
<p><strong>Key Insights</strong>:</p>
<ul class="simple">
<li><p>Element-wise operations hit <strong>memory roof</strong> at 3-4 GFLOP/s (only 1% of peak compute)</p></li>
<li><p>Fusion improves AI by reducing memory operations (0.08 → 1.0 AI)</p></li>
<li><p>Matrix multiplication approaches <strong>compute roof</strong> (80% of peak)</p></li>
<li><p>Optimization strategy should focus on memory-bound operations first</p></li>
</ul>
</section>
</section>
<section id="getting-started">
<h2>Getting Started<a class="headerlink" href="#getting-started" title="Link to this heading">#</a></h2>
<section id="prerequisites">
<h3>Prerequisites<a class="headerlink" href="#prerequisites" title="Link to this heading">#</a></h3>
<p>Ensure youve completed:</p>
<ul class="simple">
<li><p><strong>Module 14 (Profiling)</strong>: Youll use profiling tools to measure acceleration gains</p></li>
<li><p><strong>Module 01 (Tensor)</strong>: Tensor class provides foundation for operations</p></li>
<li><p><strong>NumPy/BLAS</strong>: Verify optimized BLAS backend is installed</p></li>
</ul>
<p>Check your BLAS configuration:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Activate TinyTorch environment</span>
<span class="nb">source</span><span class="w"> </span>scripts/activate-tinytorch
<span class="c1"># Check which BLAS backend NumPy uses</span>
python<span class="w"> </span>-c<span class="w"> </span><span class="s2">&quot;import numpy as np; np.show_config()&quot;</span>
<span class="c1"># Look for: openblas, mkl, or accelerate (Apple Silicon)</span>
<span class="c1"># MKL is fastest on Intel CPUs (200-500 GFLOP/s)</span>
<span class="c1"># OpenBLAS is good cross-platform (100-300 GFLOP/s)</span>
</pre></div>
</div>
<p>Verify prerequisite modules work:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>tensor
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>profiling
</pre></div>
</div>
</section>
<section id="development-workflow">
<h3>Development Workflow<a class="headerlink" href="#development-workflow" title="Link to this heading">#</a></h3>
<ol class="arabic">
<li><p><strong>Open the development file</strong>: <code class="docutils literal notranslate"><span class="pre">modules/18_acceleration/acceleration.py</span></code></p></li>
<li><p><strong>Implement vectorized matrix multiplication</strong>:</p>
<ul class="simple">
<li><p>Validate input shapes for compatibility</p></li>
<li><p>Delegate to <code class="docutils literal notranslate"><span class="pre">np.matmul</span></code> (calls optimized BLAS GEMM)</p></li>
<li><p>Return result wrapped in Tensor</p></li>
<li><p>Test correctness and measure speedup vs. naive loops</p></li>
</ul>
</li>
<li><p><strong>Build fused GELU activation</strong>:</p>
<ul class="simple">
<li><p>Implement complete GELU formula in single expression</p></li>
<li><p>Avoid creating intermediate Tensor objects</p></li>
<li><p>Test numerical correctness against reference implementation</p></li>
<li><p>Measure memory bandwidth reduction</p></li>
</ul>
</li>
<li><p><strong>Create tiled matrix multiplication</strong>:</p>
<ul class="simple">
<li><p>Understand cache blocking concept (educational)</p></li>
<li><p>Use NumPys matmul which implements tiling internally</p></li>
<li><p>Analyze cache hit rates and memory access patterns</p></li>
<li><p>Compare performance across different matrix sizes</p></li>
</ul>
</li>
<li><p><strong>Perform roofline analysis</strong>:</p>
<ul class="simple">
<li><p>Measure FLOPs and memory bandwidth for each operation</p></li>
<li><p>Calculate arithmetic intensity</p></li>
<li><p>Plot operations on roofline model</p></li>
<li><p>Identify optimization priorities</p></li>
</ul>
</li>
<li><p><strong>Export and verify</strong>:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>tito<span class="w"> </span>module<span class="w"> </span><span class="nb">complete</span><span class="w"> </span><span class="m">18</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>acceleration
</pre></div>
</div>
</li>
</ol>
</section>
</section>
<section id="testing">
<h2>Testing<a class="headerlink" href="#testing" title="Link to this heading">#</a></h2>
<section id="comprehensive-test-suite">
<h3>Comprehensive Test Suite<a class="headerlink" href="#comprehensive-test-suite" title="Link to this heading">#</a></h3>
<p>Run the full test suite to verify acceleration functionality:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># TinyTorch CLI (recommended)</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>acceleration
<span class="c1"># Direct pytest execution</span>
python<span class="w"> </span>-m<span class="w"> </span>pytest<span class="w"> </span>tests/<span class="w"> </span>-k<span class="w"> </span>acceleration<span class="w"> </span>-v
<span class="c1"># Run development file directly (includes inline tests)</span>
python<span class="w"> </span>modules/18_acceleration/acceleration.py
</pre></div>
</div>
</section>
<section id="test-coverage-areas">
<h3>Test Coverage Areas<a class="headerlink" href="#test-coverage-areas" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Vectorized Operations Correctness</strong>: Matrix multiplication produces numerically correct results, handles batching and broadcasting, validates incompatible shapes</p></li>
<li><p><strong>Kernel Fusion Correctness</strong>: Fused GELU matches reference implementation, handles extreme values without NaN/Inf, preserves data types and shapes</p></li>
<li><p><strong>Performance Validation</strong>: Vectorized matmul achieves 10-150x speedup over naive loops, kernel fusion provides 2-5x speedup and 60-80% memory reduction, performance scales appropriately with tensor size</p></li>
<li><p><strong>Integration Testing</strong>: Acceleration techniques work together in realistic transformer blocks, profiler integration measures speedups correctly, memory efficiency validated with tracemalloc</p></li>
<li><p><strong>Roofline Analysis</strong>: Arithmetic intensity calculated correctly for different operations, performance predictions match measurements within 20%, memory-bound vs. compute-bound classification accurate</p></li>
</ul>
</section>
<section id="inline-testing-performance-analysis">
<h3>Inline Testing &amp; Performance Analysis<a class="headerlink" href="#inline-testing-performance-analysis" title="Link to this heading">#</a></h3>
<p>The module includes comprehensive validation and measurement:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Run all inline tests</span>
<span class="n">python</span> <span class="n">modules</span><span class="o">/</span><span class="mi">18</span><span class="n">_acceleration</span><span class="o">/</span><span class="n">acceleration</span><span class="o">.</span><span class="n">py</span>
<span class="c1"># Expected output:</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">Vectorized</span> <span class="n">Matrix</span> <span class="n">Multiplication</span><span class="o">...</span>
<span class="err"></span> <span class="n">vectorized_matmul</span> <span class="n">works</span> <span class="n">correctly</span><span class="err">!</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">Fused</span> <span class="n">GELU</span><span class="o">...</span>
<span class="err"></span> <span class="n">fused_gelu</span> <span class="n">works</span> <span class="n">correctly</span><span class="err">!</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">Kernel</span> <span class="n">Fusion</span> <span class="n">Performance</span> <span class="n">Impact</span><span class="o">...</span>
<span class="err">📊</span> <span class="n">Kernel</span> <span class="n">Fusion</span> <span class="n">Performance</span> <span class="n">Analysis</span><span class="p">:</span>
<span class="n">Tensor</span> <span class="n">size</span><span class="p">:</span> <span class="mi">2000</span><span class="err">×</span><span class="mi">2000</span> <span class="o">=</span> <span class="mi">4</span><span class="p">,</span><span class="mi">000</span><span class="p">,</span><span class="mi">000</span> <span class="n">elements</span>
<span class="n">Unfused</span> <span class="n">time</span><span class="p">:</span> <span class="mf">45.23</span> <span class="n">ms</span>
<span class="n">Fused</span> <span class="n">time</span><span class="p">:</span> <span class="mf">18.12</span> <span class="n">ms</span>
<span class="n">Speedup</span><span class="p">:</span> <span class="mf">2.50</span><span class="err">×</span> <span class="n">faster</span>
<span class="n">Per</span><span class="o">-</span><span class="n">element</span><span class="p">:</span> <span class="mf">11.3</span> <span class="n">ns</span> <span class="err"></span> <span class="mf">4.5</span> <span class="n">ns</span>
<span class="n">Memory</span> <span class="n">efficiency</span><span class="p">:</span> <span class="mi">7</span><span class="err"></span><span class="mi">2</span> <span class="n">memory</span> <span class="n">ops</span>
<span class="n">Effective</span> <span class="n">bandwidth</span><span class="p">:</span> <span class="mf">15.2</span><span class="err"></span><span class="mf">38.5</span> <span class="n">GB</span><span class="o">/</span><span class="n">s</span>
<span class="err">🚀</span> <span class="n">Excellent</span><span class="err">!</span> <span class="n">Kernel</span> <span class="n">fusion</span> <span class="n">providing</span> <span class="n">significant</span> <span class="n">speedup</span>
<span class="err">📊</span> <span class="n">Analyzing</span> <span class="n">vectorization</span> <span class="n">scaling</span> <span class="n">behavior</span><span class="o">...</span>
<span class="err">┌─────────┬─────────────┬─────────────┬─────────────┬─────────────┐</span>
<span class="err"></span> <span class="n">Size</span> <span class="err"></span> <span class="n">Time</span> <span class="p">(</span><span class="n">ms</span><span class="p">)</span> <span class="err"></span> <span class="n">GFLOPS</span> <span class="err"></span> <span class="n">Bandwidth</span> <span class="err"></span> <span class="n">Efficiency</span> <span class="err"></span>
<span class="err">├─────────┼─────────────┼─────────────┼─────────────┼─────────────┤</span>
<span class="err"></span> <span class="mi">64</span> <span class="err"></span> <span class="mf">0.05</span> <span class="err"></span> <span class="mf">33.6</span> <span class="err"></span> <span class="mf">15.8</span> <span class="err"></span> <span class="mf">16.8</span> <span class="err"></span>
<span class="err"></span> <span class="mi">128</span> <span class="err"></span> <span class="mf">0.18</span> <span class="err"></span> <span class="mf">114.2</span> <span class="err"></span> <span class="mf">26.7</span> <span class="err"></span> <span class="mf">57.1</span> <span class="err"></span>
<span class="err"></span> <span class="mi">256</span> <span class="err"></span> <span class="mf">1.12</span> <span class="err"></span> <span class="mf">188.5</span> <span class="err"></span> <span class="mf">22.1</span> <span class="err"></span> <span class="mf">94.3</span> <span class="err"></span>
<span class="err"></span> <span class="mi">512</span> <span class="err"></span> <span class="mf">6.45</span> <span class="err"></span> <span class="mf">328.7</span> <span class="err"></span> <span class="mf">19.3</span> <span class="err"></span> <span class="mf">164.4</span> <span class="err"></span>
<span class="err"></span> <span class="mi">1024</span> <span class="err"></span> <span class="mf">42.18</span> <span class="err"></span> <span class="mf">405.1</span> <span class="err"></span> <span class="mf">16.1</span> <span class="err"></span> <span class="mf">202.6</span> <span class="err"></span>
<span class="err"></span> <span class="mi">2048</span> <span class="err"></span> <span class="mf">281.34</span> <span class="err"></span> <span class="mf">485.2</span> <span class="err"></span> <span class="mf">15.3</span> <span class="err"></span> <span class="mf">242.6</span> <span class="err"></span>
<span class="err">└─────────┴─────────────┴─────────────┴─────────────┴─────────────┘</span>
<span class="err">🧪</span> <span class="n">RUNNING</span> <span class="n">MODULE</span> <span class="n">INTEGRATION</span> <span class="n">TEST</span>
<span class="n">Running</span> <span class="n">unit</span> <span class="n">tests</span><span class="o">...</span>
<span class="err"></span> <span class="n">All</span> <span class="n">tests</span> <span class="n">passed</span><span class="err">!</span>
<span class="err">🎉</span> <span class="n">ALL</span> <span class="n">TESTS</span> <span class="n">PASSED</span><span class="err">!</span> <span class="n">Module</span> <span class="n">ready</span> <span class="k">for</span> <span class="n">export</span><span class="o">.</span>
</pre></div>
</div>
</section>
<section id="manual-testing-examples">
<h3>Manual Testing Examples<a class="headerlink" href="#manual-testing-examples" title="Link to this heading">#</a></h3>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">modules.</span><span class="mi">18</span><span class="n">_acceleration</span><span class="o">.</span><span class="n">acceleration</span> <span class="kn">import</span><span class="w"> </span><span class="o">*</span>
<span class="c1"># Test vectorized matrix multiplication</span>
<span class="n">A</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">512</span><span class="p">,</span> <span class="mi">512</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">))</span>
<span class="n">B</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">512</span><span class="p">,</span> <span class="mi">512</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">))</span>
<span class="c1"># Measure performance</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">time</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span>
<span class="n">C</span> <span class="o">=</span> <span class="n">vectorized_matmul</span><span class="p">(</span><span class="n">A</span><span class="p">,</span> <span class="n">B</span><span class="p">)</span>
<span class="n">elapsed</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span> <span class="o">-</span> <span class="n">start</span>
<span class="c1"># Calculate metrics</span>
<span class="n">flops</span> <span class="o">=</span> <span class="mi">2</span> <span class="o">*</span> <span class="mi">512</span><span class="o">**</span><span class="mi">3</span> <span class="c1"># 268 million FLOPs</span>
<span class="n">gflops</span> <span class="o">=</span> <span class="n">flops</span> <span class="o">/</span> <span class="p">(</span><span class="n">elapsed</span> <span class="o">*</span> <span class="mf">1e9</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Performance: </span><span class="si">{</span><span class="n">gflops</span><span class="si">:</span><span class="s2">.1f</span><span class="si">}</span><span class="s2"> GFLOP/s&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Time: </span><span class="si">{</span><span class="n">elapsed</span><span class="o">*</span><span class="mi">1000</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> ms&quot;</span><span class="p">)</span>
<span class="c1"># Test kernel fusion</span>
<span class="n">x</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">1000</span><span class="p">,</span> <span class="mi">1000</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">))</span>
<span class="c1"># Compare fused vs unfused</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span>
<span class="n">y_fused</span> <span class="o">=</span> <span class="n">fused_gelu</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="n">fused_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span> <span class="o">-</span> <span class="n">start</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span>
<span class="n">y_unfused</span> <span class="o">=</span> <span class="n">unfused_gelu</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="n">unfused_time</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span> <span class="o">-</span> <span class="n">start</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Speedup: </span><span class="si">{</span><span class="n">unfused_time</span><span class="o">/</span><span class="n">fused_time</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2">x&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Numerically equivalent: </span><span class="si">{</span><span class="n">np</span><span class="o">.</span><span class="n">allclose</span><span class="p">(</span><span class="n">y_fused</span><span class="o">.</span><span class="n">data</span><span class="p">,</span><span class="w"> </span><span class="n">y_unfused</span><span class="o">.</span><span class="n">data</span><span class="p">)</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="c1"># Measure with profiler</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.perf.profiling</span><span class="w"> </span><span class="kn">import</span> <span class="n">Profiler</span>
<span class="n">profiler</span> <span class="o">=</span> <span class="n">Profiler</span><span class="p">()</span>
<span class="k">class</span><span class="w"> </span><span class="nc">SimpleModel</span><span class="p">:</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">weight</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">256</span><span class="p">,</span> <span class="mi">256</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">))</span>
<span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
<span class="k">return</span> <span class="n">fused_gelu</span><span class="p">(</span><span class="n">vectorized_matmul</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">weight</span><span class="p">))</span>
<span class="n">model</span> <span class="o">=</span> <span class="n">SimpleModel</span><span class="p">()</span>
<span class="n">input_tensor</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">32</span><span class="p">,</span> <span class="mi">256</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">float32</span><span class="p">))</span>
<span class="n">latency</span> <span class="o">=</span> <span class="n">profiler</span><span class="o">.</span><span class="n">measure_latency</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">input_tensor</span><span class="p">,</span> <span class="n">warmup</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">20</span><span class="p">)</span>
<span class="n">flops</span> <span class="o">=</span> <span class="n">profiler</span><span class="o">.</span><span class="n">count_flops</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="p">(</span><span class="mi">32</span><span class="p">,</span> <span class="mi">256</span><span class="p">))</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Latency: </span><span class="si">{</span><span class="n">latency</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> ms&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;FLOPs: </span><span class="si">{</span><span class="n">flops</span><span class="si">:</span><span class="s2">,</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Throughput: </span><span class="si">{</span><span class="n">flops</span><span class="w"> </span><span class="o">/</span><span class="w"> </span><span class="p">(</span><span class="n">latency</span><span class="o">/</span><span class="mi">1000</span><span class="p">)</span><span class="w"> </span><span class="o">/</span><span class="w"> </span><span class="mf">1e9</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> GFLOP/s&quot;</span><span class="p">)</span>
</pre></div>
</div>
</section>
</section>
<section id="systems-thinking-questions">
<h2>Systems Thinking Questions<a class="headerlink" href="#systems-thinking-questions" title="Link to this heading">#</a></h2>
<section id="real-world-applications">
<h3>Real-World Applications<a class="headerlink" href="#real-world-applications" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Training Acceleration</strong>: How do vectorized operations reduce training time for transformers? Whats the speedup for attention computation (mostly matrix multiplies) vs. layer normalization (element-wise operations)?</p></li>
<li><p><strong>Inference Optimization</strong>: Why is kernel fusion more important for inference than training? How does batch size affect the benefit of vectorization vs. fusion?</p></li>
<li><p><strong>Hardware Selection</strong>: Given a model with 70% matrix multiplies and 30% element-wise operations, should you optimize for compute or memory bandwidth? How does this affect CPU vs. GPU selection?</p></li>
<li><p><strong>Cloud Cost Reduction</strong>: If vectorization provides 100x speedup on matrix operations that take 80% of training time, whats the overall training time reduction and cost savings?</p></li>
</ul>
</section>
<section id="roofline-analysis-foundations">
<h3>Roofline Analysis Foundations<a class="headerlink" href="#roofline-analysis-foundations" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Arithmetic Intensity Calculation</strong>: For convolution with kernel size K×K, input channels C_in, output channels C_out, and spatial dimensions H×W, whats the arithmetic intensity? Is it compute-bound or memory-bound?</p></li>
<li><p><strong>Memory Hierarchy Impact</strong>: Why does cache blocking improve performance by 2-5x even though it performs the same FLOPs? Whats the latency difference between L1 cache hits (4 cycles) vs. RAM accesses (300 cycles)?</p></li>
<li><p><strong>BLAS Library Performance</strong>: Why does NumPys matmul achieve 200-500 GFLOP/s while naive Python loops achieve 1 GFLOP/s? What optimizations do BLAS libraries implement that interpreted Python cant?</p></li>
<li><p><strong>Batch Size Effects</strong>: How does batch size affect arithmetic intensity for matrix multiplication? Why do larger batches achieve higher GFLOP/s on the same hardware?</p></li>
</ul>
</section>
<section id="optimization-strategy-characteristics">
<h3>Optimization Strategy Characteristics<a class="headerlink" href="#optimization-strategy-characteristics" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Memory-Bound Operations</strong>: Why does adding more CPU cores NOT improve element-wise addition performance? Whats the fundamental bottleneck, and how do you fix it?</p></li>
<li><p><strong>Kernel Fusion Trade-offs</strong>: Fused GELU reduces memory operations from 16 to 2 per element. Why doesnt this give 8x speedup? What other factors limit acceleration?</p></li>
<li><p><strong>Production Optimization Priority</strong>: Given profiling data showing 40% time in attention softmax (memory-bound), 30% in matmuls (compute-bound), and 30% in data loading (I/O-bound), which should you optimize first? Why?</p></li>
<li><p><strong>Cross-Platform Performance</strong>: Why do vectorized operations using BLAS achieve different speedups on Intel CPUs (MKL: 500 GFLOP/s) vs. AMD CPUs (OpenBLAS: 200 GFLOP/s) vs. Apple Silicon (Accelerate: 300 GFLOP/s)? Whats hardware-dependent vs. algorithmic?</p></li>
</ul>
</section>
</section>
<section id="ready-to-build">
<h2>Ready to Build?<a class="headerlink" href="#ready-to-build" title="Link to this heading">#</a></h2>
<p>Youre about to learn the hardware-aware optimization techniques that separate research prototypes from production ML systems. Understanding how to extract maximum performance from CPUs—through vectorization, cache optimization, and memory bandwidth reduction—is foundational knowledge for any ML engineer.</p>
<p>These arent just academic exercises. Every time you use PyTorch or TensorFlow, youre benefiting from these exact techniques implemented in their backend libraries. By building them yourself, youll understand:</p>
<ul class="simple">
<li><p>Why transformers (mostly matmuls) run efficiently on GPUs while RNNs (sequential operations) struggle</p></li>
<li><p>How to predict whether adding more hardware will help before spending cloud budget</p></li>
<li><p>When to optimize code vs. when to redesign algorithms for better arithmetic intensity</p></li>
<li><p>How to measure and validate performance improvements systematically</p></li>
</ul>
<p>The roofline model and arithmetic intensity analysis youll master here apply directly to GPUs, TPUs, and custom AI accelerators. Hardware changes, but the fundamental memory-vs-compute trade-offs remain constant. This module gives you the mental models and measurement tools to optimize on any platform.</p>
<p>Choose your preferred way to engage with this module:</p>
<div class="sd-container-fluid sd-sphinx-override sd-mb-4 docutils">
<div class="sd-row sd-row-cols-1 sd-row-cols-xs-1 sd-row-cols-sm-2 sd-row-cols-md-3 sd-row-cols-lg-3 docutils">
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
🚀 Launch Binder</div>
<p class="sd-card-text">Run this module interactively in your browser. No installation required!</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/18_acceleration/acceleration_dev.ipynb"><span>https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/18_acceleration/acceleration_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
⚡ Open in Colab</div>
<p class="sd-card-text">Use Google Colab for GPU access and cloud compute power.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/18_acceleration/acceleration_dev.ipynb"><span>https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/18_acceleration/acceleration_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
📖 View Source</div>
<p class="sd-card-text">Browse the Python source code and understand the implementation.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://github.com/mlsysbook/TinyTorch/blob/main/modules/18_acceleration/acceleration.py"><span>https://github.com/mlsysbook/TinyTorch/blob/main/modules/18_acceleration/acceleration.py</span></a></div>
</div>
</div>
</div>
<div class="tip admonition">
<p class="admonition-title">💾 Save Your Progress</p>
<p><strong>Binder sessions are temporary!</strong> Download your completed notebook when done, or switch to local development for persistent work.</p>
</div>
<hr class="docutils" />
<div class="prev-next-area">
<a class="left-prev" href="../chapters/17_memoization_ABOUT.html" title="previous page">← Module 17: Memoization</a>
<a class="right-next" href="../chapters/19_benchmarking_ABOUT.html" title="next page">Module 19: Benchmarking →</a>
</div>
</section>
</section>
<script type="text/x-thebe-config">
{
requestKernel: true,
binderOptions: {
repo: "binder-examples/jupyter-stacks-datascience",
ref: "master",
},
codeMirrorConfig: {
theme: "abcdef",
mode: "python"
},
kernelOptions: {
name: "python3",
path: "./modules"
},
predefinedOutput: true
}
</script>
<script>kernelName = 'python3'</script>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
<a class="left-prev"
href="17_memoization_ABOUT.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">17. Memoization - Computational Reuse for Inference</p>
</div>
</a>
<a class="right-next"
href="19_benchmarking_ABOUT.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">19. Benchmarking - Fair Performance Comparison</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> Contents
</div>
<nav class="bd-toc-nav page-toc">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-optimize">Build → Use → Optimize</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#why-this-matters-the-hardware-reality">Why This Matters: The Hardware Reality</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#the-performance-gap">The Performance Gap</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#from-naive-python-to-production-performance">From Naive Python to Production Performance</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-impact">Real-World Impact</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#the-roofline-model-your-performance-compass">The Roofline Model: Your Performance Compass</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#understanding-hardware-limits">Understanding Hardware Limits</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#example-calculations">Example Calculations</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#vectorized-matrix-multiplication">1. Vectorized Matrix Multiplication</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#kernel-fusion-eliminating-memory-traffic">2. Kernel Fusion: Eliminating Memory Traffic</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#cache-aware-tiling-blocked-algorithms">3. Cache-Aware Tiling (Blocked Algorithms)</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#roofline-analysis-in-practice">4. Roofline Analysis in Practice</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-performance-analysis">Inline Testing &amp; Performance Analysis</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#roofline-analysis-foundations">Roofline Analysis Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#optimization-strategy-characteristics">Optimization Strategy Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
</nav></div>
</div></div>
</div>
<footer class="bd-footer-content">
<div class="bd-footer-content__inner container">
<div class="footer-item">
<p class="component-author">
By Prof. Vijay Janapa Reddi (Harvard University)
</p>
</div>
<div class="footer-item">
<p class="copyright">
© Copyright 2025.
<br/>
</p>
</div>
<div class="footer-item">
</div>
<div class="footer-item">
</div>
</div>
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>
<footer class="bd-footer">
</footer>
</body>
</html>