Files
TinyTorch/modules/19_benchmarking_ABOUT.html
2025-12-05 00:52:38 +00:00

1012 lines
68 KiB
HTML
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en" data-content_root="../" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<title>19. Benchmarking - Fair Performance Comparison &#8212; Tiny🔥Torch</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=03e43079" />
<link rel="stylesheet" type="text/css" href="../_static/styles/sphinx-book-theme.css?v=eba8b062" />
<link rel="stylesheet" type="text/css" href="../_static/togglebutton.css?v=13237357" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
<link rel="stylesheet" type="text/css" href="../_static/mystnb.8ecb98da25f57f5357bf6f572d296f466b2cfe2517ffebfabe82451661e28f02.css" />
<link rel="stylesheet" type="text/css" href="../_static/sphinx-thebe.css?v=4fa983c6" />
<link rel="stylesheet" type="text/css" href="../_static/sphinx-design.min.css?v=95c83b7e" />
<link rel="stylesheet" type="text/css" href="../_static/custom.css?v=009d37f4" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
<script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/documentation_options.js?v=9eb32ce0"></script>
<script src="../_static/doctools.js?v=9a2dae69"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
<script src="../_static/copybutton.js?v=f281be69"></script>
<script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
<script>let toggleHintShow = 'Click to show';</script>
<script>let toggleHintHide = 'Click to hide';</script>
<script>let toggleOpenOnPrint = 'true';</script>
<script src="../_static/togglebutton.js?v=4a39c7ea"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script src="../_static/design-tabs.js?v=f930bc37"></script>
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
<script async="async" src="../_static/sphinx-thebe.js?v=c100c467"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
<script>DOCUMENTATION_OPTIONS.pagename = 'modules/19_benchmarking_ABOUT';</script>
<script src="../_static/ml-timeline.js?v=76e9b3e3"></script>
<script src="../_static/wip-banner.js?v=04a7e74d"></script>
<script src="../_static/marimo-badges.js?v=e6289128"></script>
<script src="../_static/sidebar-link.js?v=404b701b"></script>
<script src="../_static/hero-carousel.js?v=10341d2a"></script>
<script src="../_static/subscribe-modal.js?v=42919b64"></script>
<link rel="icon" href="../_static/favicon.svg"/>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="🏅 Torch Olympics (Module 20)" href="../tiers/olympics.html" />
<link rel="prev" title="18. Acceleration - CPU Vectorization &amp; Cache Optimization" href="18_acceleration_ABOUT.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<input type="checkbox"
class="sidebar-toggle"
id="pst-primary-sidebar-checkbox"/>
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
<input type="checkbox"
class="sidebar-toggle"
id="pst-secondary-sidebar-checkbox"/>
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search..."
aria-label="Search..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<a class="navbar-brand logo" href="../intro.html">
<img src="../_static/logo-tinytorch.png" class="logo__image only-light" alt="Tiny🔥Torch - Home"/>
<script>document.write(`<img src="../_static/logo-tinytorch.png" class="logo__image only-dark" alt="Tiny🔥Torch - Home"/>`);</script>
</a></div>
<div class="sidebar-primary-item">
<script>
document.write(`
<button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script></div>
<div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
<div class="bd-toc-item navbar-nav active">
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🚀 Getting Started</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../getting-started.html">Complete Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏗 Foundation Tier (01-07)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/foundation.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="01_tensor_ABOUT.html">01. Tensor</a></li>
<li class="toctree-l1"><a class="reference internal" href="02_activations_ABOUT.html">02. Activations</a></li>
<li class="toctree-l1"><a class="reference internal" href="03_layers_ABOUT.html">03. Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="04_losses_ABOUT.html">04. Losses</a></li>
<li class="toctree-l1"><a class="reference internal" href="05_autograd_ABOUT.html">05. Autograd</a></li>
<li class="toctree-l1"><a class="reference internal" href="06_optimizers_ABOUT.html">06. Optimizers</a></li>
<li class="toctree-l1"><a class="reference internal" href="07_training_ABOUT.html">07. Training</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏛️ Architecture Tier (08-13)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/architecture.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="08_dataloader_ABOUT.html">08. DataLoader</a></li>
<li class="toctree-l1"><a class="reference internal" href="09_spatial_ABOUT.html">09. Convolutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="10_tokenization_ABOUT.html">10. Tokenization</a></li>
<li class="toctree-l1"><a class="reference internal" href="11_embeddings_ABOUT.html">11. Embeddings</a></li>
<li class="toctree-l1"><a class="reference internal" href="12_attention_ABOUT.html">12. Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="13_transformers_ABOUT.html">13. Transformers</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">⏱️ Optimization Tier (14-19)</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/optimization.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="14_profiling_ABOUT.html">14. Profiling</a></li>
<li class="toctree-l1"><a class="reference internal" href="15_quantization_ABOUT.html">15. Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="16_compression_ABOUT.html">16. Compression</a></li>
<li class="toctree-l1"><a class="reference internal" href="17_memoization_ABOUT.html">17. Memoization</a></li>
<li class="toctree-l1"><a class="reference internal" href="18_acceleration_ABOUT.html">18. Acceleration</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">19. Benchmarking</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏅 Capstone Competition</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/olympics.html">📖 Competition Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="20_capstone_ABOUT.html">20. Torch Olympics</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🧭 Course Orientation</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../chapters/00-introduction.html">Course Structure</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prerequisites.html">Prerequisites &amp; Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/learning-journey.html">Learning Journey</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/milestones.html">Historical Milestones</a></li>
<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🛠️ TITO CLI Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tito/overview.html">Command Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/modules.html">Module Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/milestones.html">Milestone System</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/data.html">Progress &amp; Data</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="../datasets.html">Datasets Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🤝 Community</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../community.html">Ecosystem</a></li>
<li class="toctree-l1"><a class="reference internal" href="../resources.html">Learning Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../credits.html">Credits &amp; Acknowledgments</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="sbt-scroll-pixel-helper"></div>
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item"><button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-bars"></span>
</button></div>
</div>
<div class="header-article-items__end">
<div class="header-article-item">
<div class="article-header-buttons">
<div class="dropdown dropdown-download-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
<i class="fas fa-download"></i>
</button>
<ul class="dropdown-menu">
<li><a href="../_sources/modules/19_benchmarking_ABOUT.md" target="_blank"
class="btn btn-sm btn-download-source-button dropdown-item"
title="Download source file"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file"></i>
</span>
<span class="btn__text-container">.md</span>
</a>
</li>
<li>
<button onclick="window.print()"
class="btn btn-sm btn-download-pdf-button dropdown-item"
title="Print to PDF"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file-pdf"></i>
</span>
<span class="btn__text-container">.pdf</span>
</button>
</li>
</ul>
</div>
<button onclick="toggleFullScreen()"
class="btn btn-sm btn-fullscreen-button"
title="Fullscreen mode"
data-bs-placement="bottom" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-expand"></i>
</span>
</button>
<script>
document.write(`
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
</button>
`);
</script>
<script>
document.write(`
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass fa-lg"></i>
</button>
`);
</script>
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-list"></span>
</button>
</div></div>
</div>
</div>
</div>
<div id="jb-print-docs-body" class="onlyprint">
<h1>19. Benchmarking - Fair Performance Comparison</h1>
<!-- Table of contents -->
<div id="print-main-content">
<div id="jb-print-toc">
<div>
<h2> Contents </h2>
</div>
<nav aria-label="Page">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-analyze">Build → Use → Analyze</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#core-benchmarking-components">Core Benchmarking Components</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#statistical-measurement-infrastructure">1. Statistical Measurement Infrastructure</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#warmup-and-measurement-protocol">2. Warmup and Measurement Protocol</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#normalized-metrics-for-fair-comparison">3. Normalized Metrics for Fair Comparison</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-benchmark-suite">4. Comprehensive Benchmark Suite</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-benchmarking-principles">Real-World Benchmarking Principles</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#reproducibility-requirements">Reproducibility Requirements</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#fair-comparison-protocol">Fair Comparison Protocol</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#statistical-significance-testing">Statistical Significance Testing</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#connection-to-competition-workflow-module-20">Connection to Competition Workflow (Module 20)</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-validation">Inline Testing &amp; Validation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#statistical-foundations">Statistical Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
</nav>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section id="benchmarking-fair-performance-comparison">
<h1>19. Benchmarking - Fair Performance Comparison<a class="headerlink" href="#benchmarking-fair-performance-comparison" title="Link to this heading">#</a></h1>
<p><strong>OPTIMIZATION TIER</strong> | Difficulty: ⭐⭐⭐ (3/4) | Time: 5-6 hours</p>
<section id="overview">
<h2>Overview<a class="headerlink" href="#overview" title="Link to this heading">#</a></h2>
<p>You’ll build a rigorous performance measurement system that enables fair comparison of all your optimizations. This module implements educational benchmarking with statistical testing, normalized metrics, and reproducible protocols. Your benchmarking framework provides the measurement methodology used in Module 20’s competition workflow, where you’ll apply these tools to validate optimizations systematically.</p>
</section>
<section id="learning-objectives">
<h2>Learning Objectives<a class="headerlink" href="#learning-objectives" title="Link to this heading">#</a></h2>
<p>By the end of this module, you will be able to:</p>
<ul class="simple">
<li><p><strong>Understand benchmark design principles</strong>: Reproducibility requirements; representative workload selection; measurement methodology; controlling for confounding variables; fair comparison protocols</p></li>
<li><p><strong>Implement statistical rigor</strong>: Multiple runs with warmup periods; confidence interval calculation; variance reporting not just means; understanding measurement uncertainty; detecting outliers</p></li>
<li><p><strong>Master fair comparison protocols</strong>: Hardware normalization strategies; environmental controls (thermal, OS noise); baseline selection criteria; same workload/data/environment enforcement; apples-to-apples measurement</p></li>
<li><p><strong>Build normalized metrics systems</strong>: Speedup ratios (baseline_time / optimized_time); compression factors (original_size / compressed_size); accuracy preservation tracking; efficiency scores combining multiple objectives; hardware-independent reporting</p></li>
<li><p><strong>Analyze measurement trade-offs</strong>: Benchmark coverage vs runtime cost; statistical power vs sample size requirements; reproducibility vs realism; instrumentation overhead (observer effect); when 5% speedup is significant vs noise</p></li>
</ul>
</section>
<section id="build-use-analyze">
<h2>Build → Use → Analyze<a class="headerlink" href="#build-use-analyze" title="Link to this heading">#</a></h2>
<p>This module follows TinyTorch’s <strong>Build → Use → Analyze</strong> framework:</p>
<ol class="arabic simple">
<li><p><strong>Build</strong>: Implement benchmarking framework with statistical testing (confidence intervals, t-tests), normalized metrics (speedup, compression, efficiency), warmup protocols, and automated report generation</p></li>
<li><p><strong>Use</strong>: Benchmark all your Optimization Tier implementations (profiling, quantization, compression, memoization, acceleration) against baselines on real tasks; compare fairly with statistical rigor</p></li>
<li><p><strong>Analyze</strong>: Why do benchmark results vary across runs? How does hardware affect comparison fairness? When is 5% speedup statistically significant vs noise? What makes benchmarks representative vs over-fitted?</p></li>
</ol>
</section>
<section id="implementation-guide">
<h2>Implementation Guide<a class="headerlink" href="#implementation-guide" title="Link to this heading">#</a></h2>
<section id="core-benchmarking-components">
<h3>Core Benchmarking Components<a class="headerlink" href="#core-benchmarking-components" title="Link to this heading">#</a></h3>
<p>Your benchmarking framework implements four key systems:</p>
<section id="statistical-measurement-infrastructure">
<h4>1. Statistical Measurement Infrastructure<a class="headerlink" href="#statistical-measurement-infrastructure" title="Link to this heading">#</a></h4>
<p><strong>Why Multiple Runs Matter</strong></p>
<p>Single measurements are meaningless in ML systems. Performance varies 10-30% across runs due to:</p>
<ul class="simple">
<li><p><strong>Thermal throttling</strong>: CPU frequency drops when hot</p></li>
<li><p><strong>OS background tasks</strong>: Interrupts, garbage collection, other processes</p></li>
<li><p><strong>Memory state</strong>: Cache coldness, fragmentation, swap pressure</p></li>
<li><p><strong>CPU frequency scaling</strong>: Dynamic frequency adjustment</p></li>
</ul>
<p><strong>Statistical Solution</strong></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">BenchmarkResult</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Container for measurements with statistical analysis.&quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">metric_name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">values</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">mean</span> <span class="o">=</span> <span class="n">statistics</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">values</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">std</span> <span class="o">=</span> <span class="n">statistics</span><span class="o">.</span><span class="n">stdev</span><span class="p">(</span><span class="n">values</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">median</span> <span class="o">=</span> <span class="n">statistics</span><span class="o">.</span><span class="n">median</span><span class="p">(</span><span class="n">values</span><span class="p">)</span>
<span class="c1"># 95% confidence interval for the mean</span>
<span class="n">t_score</span> <span class="o">=</span> <span class="mf">1.96</span> <span class="c1"># Normal approximation</span>
<span class="n">margin</span> <span class="o">=</span> <span class="n">t_score</span> <span class="o">*</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">std</span> <span class="o">/</span> <span class="n">np</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">values</span><span class="p">)))</span>
<span class="bp">self</span><span class="o">.</span><span class="n">ci_lower</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">mean</span> <span class="o">-</span> <span class="n">margin</span>
<span class="bp">self</span><span class="o">.</span><span class="n">ci_upper</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">mean</span> <span class="o">+</span> <span class="n">margin</span>
</pre></div>
</div>
<p><strong>What This Reveals</strong>: If confidence intervals overlap between baseline and optimized, the difference might be noise. Statistical rigor prevents false claims.</p>
</section>
<section id="warmup-and-measurement-protocol">
<h4>2. Warmup and Measurement Protocol<a class="headerlink" href="#warmup-and-measurement-protocol" title="Link to this heading">#</a></h4>
<p><strong>The Warmup Problem</strong></p>
<p>First run: 120ms. Second run: 100ms. Third run: 98ms. What happened?</p>
<ul class="simple">
<li><p><strong>Cold cache</strong>: First run pays cache miss penalties</p></li>
<li><p><strong>JIT compilation</strong>: Frameworks with JIT compilers compile code paths on first use, and libraries such as NumPy may pay one-time initialization costs on their first call</p></li>
<li><p><strong>Memory allocation</strong>: Initial runs establish memory patterns</p></li>
</ul>
<p><strong>Warmup Solution</strong></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">Benchmark</span><span class="p">:</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">warmup_runs</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">measurement_runs</span><span class="o">=</span><span class="mi">10</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">warmup_runs</span> <span class="o">=</span> <span class="n">warmup_runs</span>
<span class="bp">self</span><span class="o">.</span><span class="n">measurement_runs</span> <span class="o">=</span> <span class="n">measurement_runs</span>
<span class="k">def</span><span class="w"> </span><span class="nf">run_latency_benchmark</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">model</span><span class="p">,</span> <span class="n">input_data</span><span class="p">):</span>
<span class="c1"># Warmup: stabilize performance</span>
<span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">warmup_runs</span><span class="p">):</span>
<span class="n">model</span><span class="o">.</span><span class="n">forward</span><span class="p">(</span><span class="n">input_data</span><span class="p">)</span>
<span class="c1"># Measurement: collect statistics</span>
<span class="n">latencies</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">measurement_runs</span><span class="p">):</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">perf_counter</span><span class="p">()</span>
<span class="n">model</span><span class="o">.</span><span class="n">forward</span><span class="p">(</span><span class="n">input_data</span><span class="p">)</span>
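<span class="n">latencies</span><span class="o">.</span><span class="n">append</span><span class="p">((</span><span class="n">time</span><span class="o">.</span><span class="n">perf_counter</span><span class="p">()</span> <span class="o">-</span> <span class="n">start</span><span class="p">)</span> <span class="o">*</span> <span class="mi">1000</span><span class="p">)</span>  <span class="c1"># perf_counter() returns seconds; convert to ms</span>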
<span class="k">return</span> <span class="n">BenchmarkResult</span><span class="p">(</span><span class="s2">&quot;latency_ms&quot;</span><span class="p">,</span> <span class="n">latencies</span><span class="p">)</span>
</pre></div>
</div>
<p><strong>Why This Matters</strong>: Warmup runs discard cold-start effects. Measurement runs capture true steady-state performance.</p>
</section>
<section id="normalized-metrics-for-fair-comparison">
<h4>3. Normalized Metrics for Fair Comparison<a class="headerlink" href="#normalized-metrics-for-fair-comparison" title="Link to this heading">#</a></h4>
<p><strong>Hardware-Independent Speedup</strong></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Speedup ratio: baseline_time / optimized_time</span>
<span class="n">speedup</span> <span class="o">=</span> <span class="n">baseline_result</span><span class="o">.</span><span class="n">mean</span> <span class="o">/</span> <span class="n">optimized_result</span><span class="o">.</span><span class="n">mean</span>
<span class="c1"># Example: 100ms / 80ms = 1.25x speedup (25% faster)</span>
<span class="c1"># Speedup &gt; 1.0 means optimization helped</span>
<span class="c1"># Speedup &lt; 1.0 means optimization regressed</span>
</pre></div>
</div>
<p><strong>Compression Ratio</strong></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Model size reduction</span>
<span class="n">compression_ratio</span> <span class="o">=</span> <span class="n">original_size_mb</span> <span class="o">/</span> <span class="n">compressed_size_mb</span>
<span class="c1"># Example: 100MB / 25MB = 4x compression</span>
</pre></div>
</div>
<p><strong>Efficiency Score (Multi-Objective)</strong></p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Combine speed + size + accuracy</span>
<span class="n">efficiency</span> <span class="o">=</span> <span class="p">(</span><span class="n">speedup</span> <span class="o">*</span> <span class="n">compression</span><span class="p">)</span> <span class="o">/</span> <span class="p">(</span><span class="mi">1</span> <span class="o">+</span> <span class="nb">abs</span><span class="p">(</span><span class="n">accuracy_delta</span><span class="p">))</span>
<span class="c1"># Penalizes accuracy loss (note: abs() also penalizes accuracy gains)</span>
<span class="c1"># Rewards speed AND compression</span>
<span class="c1"># Single metric for ranking</span>
</pre></div>
</div>
<p><strong>Why Normalized Metrics</strong>: Speedup ratios work on any hardware. “2x faster” is meaningful whether you have an M1 Mac or an Intel i9. Absolute times (100ms → 50ms) are hardware-specific.</p>
</section>
<section id="comprehensive-benchmark-suite">
<h4>4. Comprehensive Benchmark Suite<a class="headerlink" href="#comprehensive-benchmark-suite" title="Link to this heading">#</a></h4>
<p><strong>Multiple Benchmark Types</strong></p>
<p>Your <code class="docutils literal notranslate"><span class="pre">BenchmarkSuite</span></code> runs:</p>
<ol class="arabic simple">
<li><p><strong>Latency Benchmark</strong>: How fast is inference? (milliseconds)</p></li>
<li><p><strong>Accuracy Benchmark</strong>: How correct are predictions? (0.0-1.0)</p></li>
<li><p><strong>Memory Benchmark</strong>: How much RAM is used? (megabytes)</p></li>
<li><p><strong>Energy Benchmark</strong>: How efficient is compute? (estimated joules)</p></li>
</ol>
<p><strong>Pareto Frontier Analysis</strong></p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Accuracy
| A ● ← Model A: High accuracy, high latency
|
| B ● ← Model B: Balanced (Pareto optimal)
|
| C ●← Model C: Low accuracy, low latency
|__________→ Latency (lower is better)
</pre></div>
</div>
<p>Models on the Pareto frontier aren't strictly dominated—each represents a valid optimization trade-off. Your suite automatically identifies these optimal points.</p>
</section>
</section>
<section id="real-world-benchmarking-principles">
<h3>Real-World Benchmarking Principles<a class="headerlink" href="#real-world-benchmarking-principles" title="Link to this heading">#</a></h3>
<p>Your implementation teaches industry-standard methodology:</p>
<section id="reproducibility-requirements">
<h4>Reproducibility Requirements<a class="headerlink" href="#reproducibility-requirements" title="Link to this heading">#</a></h4>
<p>Every benchmark run documents:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">system_info</span> <span class="o">=</span> <span class="p">{</span>
<span class="s1">&#39;platform&#39;</span><span class="p">:</span> <span class="s1">&#39;macOS-14.2-arm64&#39;</span><span class="p">,</span> <span class="c1"># OS version</span>
<span class="s1">&#39;processor&#39;</span><span class="p">:</span> <span class="s1">&#39;Apple M1 Max&#39;</span><span class="p">,</span> <span class="c1"># CPU type</span>
<span class="s1">&#39;python_version&#39;</span><span class="p">:</span> <span class="s1">&#39;3.11.6&#39;</span><span class="p">,</span> <span class="c1"># Runtime</span>
<span class="s1">&#39;memory_gb&#39;</span><span class="p">:</span> <span class="mi">64</span><span class="p">,</span> <span class="c1"># RAM</span>
<span class="s1">&#39;cpu_count&#39;</span><span class="p">:</span> <span class="mi">10</span> <span class="c1"># Cores</span>
<span class="p">}</span>
</pre></div>
</div>
<p><strong>Why</strong>: A colleague should be able to reproduce your results given the same environment. Missing details make verification impossible.</p>
</section>
<section id="fair-comparison-protocol">
<h4>Fair Comparison Protocol<a class="headerlink" href="#fair-comparison-protocol" title="Link to this heading">#</a></h4>
<p><strong>Don't Compare</strong>:</p>
<ul class="simple">
<li><p>GPU-optimized code vs CPU baseline (unfair hardware)</p></li>
<li><p>Quantized INT8 vs FP32 baseline without reporting the accuracy change (unfair precision)</p></li>
<li><p>Batch size 32 vs batch size 1 (unfair workload)</p></li>
<li><p>Cold start vs warmed up (unfair cache state)</p></li>
</ul>
<p><strong>Do Compare</strong>:</p>
<ul class="simple">
<li><p>Same hardware, same workload, same environment</p></li>
<li><p>Baseline vs optimized on identical conditions</p></li>
<li><p>Report speedup with confidence intervals</p></li>
<li><p>Test statistical significance (t-test, p &lt; 0.05)</p></li>
</ul>
</section>
<section id="statistical-significance-testing">
<h4>Statistical Significance Testing<a class="headerlink" href="#statistical-significance-testing" title="Link to this heading">#</a></h4>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">scipy</span><span class="w"> </span><span class="kn">import</span> <span class="n">stats</span>
<span class="n">baseline_times</span> <span class="o">=</span> <span class="p">[</span><span class="mi">100</span><span class="p">,</span> <span class="mi">102</span><span class="p">,</span> <span class="mi">98</span><span class="p">,</span> <span class="mi">101</span><span class="p">,</span> <span class="mi">99</span><span class="p">]</span> <span class="c1"># ms</span>
<span class="n">optimized_times</span> <span class="o">=</span> <span class="p">[</span><span class="mi">95</span><span class="p">,</span> <span class="mi">97</span><span class="p">,</span> <span class="mi">93</span><span class="p">,</span> <span class="mi">96</span><span class="p">,</span> <span class="mi">94</span><span class="p">]</span>
<span class="c1"># Is the difference real or noise?</span>
<span class="n">t_stat</span><span class="p">,</span> <span class="n">p_value</span> <span class="o">=</span> <span class="n">stats</span><span class="o">.</span><span class="n">ttest_ind</span><span class="p">(</span><span class="n">baseline_times</span><span class="p">,</span> <span class="n">optimized_times</span><span class="p">)</span>
<span class="k">if</span> <span class="n">p_value</span> <span class="o">&lt;</span> <span class="mf">0.05</span><span class="p">:</span>
<span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Statistically significant (p &lt; 0.05)&quot;</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Not significant—could be noise&quot;</span><span class="p">)</span>
</pre></div>
</div>
<p><strong>Why This Matters</strong>: A 5% speedup with p=0.08 isn't statistically significant; it could be measurement variance. Production teams don't merge optimizations without statistical confidence.</p>
</section>
</section>
<section id="connection-to-competition-workflow-module-20">
<h3>Connection to Competition Workflow (Module 20)<a class="headerlink" href="#connection-to-competition-workflow-module-20" title="Link to this heading">#</a></h3>
<p>This benchmarking infrastructure provides the measurement harness used in Module 20's competition workflow:</p>
<p><strong>How Module 20 Uses This Framework</strong></p>
<ol class="arabic simple">
<li><p>Module 20 uses your <code class="docutils literal notranslate"><span class="pre">Benchmark</span></code> class to measure baseline and optimized performance</p></li>
<li><p>Statistical rigor from this module ensures fair comparison across submissions</p></li>
<li><p>Normalized metrics enable hardware-independent ranking</p></li>
<li><p>Reproducible protocols ensure all competitors use the same measurement methodology</p></li>
</ol>
<p><strong>The Workflow</strong></p>
<ol class="arabic simple">
<li><p>Module 19: Learn benchmarking methodology (statistical rigor, fair comparison)</p></li>
<li><p>Module 20: Apply benchmarking tools in competition workflow (submission generation, validation)</p></li>
<li><p>Competition: Use Benchmark harness to measure and validate optimizations</p></li>
</ol>
<p>Your benchmarking framework provides the foundation for fair competition—same measurement methodology, same statistical analysis, same reporting format. Module 20 teaches how to use these tools in a competition context.</p>
</section>
</section>
<section id="getting-started">
<h2>Getting Started<a class="headerlink" href="#getting-started" title="Link to this heading">#</a></h2>
<section id="prerequisites">
<h3>Prerequisites<a class="headerlink" href="#prerequisites" title="Link to this heading">#</a></h3>
<p>Ensure you understand the optimization foundations:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Activate TinyTorch environment</span>
<span class="nb">source</span><span class="w"> </span>scripts/activate-tinytorch
<span class="c1"># Verify prerequisite modules</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>profiling
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>quantization
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>compression
</pre></div>
</div>
</section>
<section id="development-workflow">
<h3>Development Workflow<a class="headerlink" href="#development-workflow" title="Link to this heading">#</a></h3>
<ol class="arabic simple">
<li><p><strong>Open the development file</strong>: <code class="docutils literal notranslate"><span class="pre">modules/19_benchmarking/benchmarking_dev.py</span></code></p></li>
<li><p><strong>Implement BenchmarkResult</strong>: Container for measurements with statistical analysis</p></li>
<li><p><strong>Build Benchmark class</strong>: Runner with warmup, multiple runs, metrics collection</p></li>
<li><p><strong>Create BenchmarkSuite</strong>: Full evaluation with latency/accuracy/memory/energy</p></li>
<li><p><strong>Add reporting</strong>: Automated report generation with visualizations</p></li>
<li><p><strong>Export and verify</strong>: <code class="docutils literal notranslate"><span class="pre">tito</span> <span class="pre">module</span> <span class="pre">complete</span> <span class="pre">19</span> <span class="pre">&amp;&amp;</span> <span class="pre">tito</span> <span class="pre">test</span> <span class="pre">benchmarking</span></code></p></li>
</ol>
</section>
</section>
<section id="testing">
<h2>Testing<a class="headerlink" href="#testing" title="Link to this heading">#</a></h2>
<section id="comprehensive-test-suite">
<h3>Comprehensive Test Suite<a class="headerlink" href="#comprehensive-test-suite" title="Link to this heading">#</a></h3>
<p>Run the full test suite to verify benchmarking functionality:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># TinyTorch CLI (recommended)</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>benchmarking
<span class="c1"># Direct pytest execution</span>
python<span class="w"> </span>-m<span class="w"> </span>pytest<span class="w"> </span>tests/<span class="w"> </span>-k<span class="w"> </span>benchmarking<span class="w"> </span>-v
</pre></div>
</div>
</section>
<section id="test-coverage-areas">
<h3>Test Coverage Areas<a class="headerlink" href="#test-coverage-areas" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Statistical Calculations</strong>: Mean, std, median, confidence intervals computed correctly</p></li>
<li><p><strong>Multiple Runs</strong>: Warmup and measurement phases work properly</p></li>
<li><p><strong>Normalized Metrics</strong>: Speedup, compression, efficiency calculated accurately</p></li>
<li><p><strong>Fair Comparison</strong>: Same workload enforcement, baseline vs optimized</p></li>
<li><p><strong>Result Serialization</strong>: BenchmarkResult converts to dict for storage</p></li>
<li><p><strong>Visualization</strong>: Plots generate with proper formatting and error bars</p></li>
<li><p><strong>System Info</strong>: Metadata captured for reproducibility</p></li>
<li><p><strong>Pareto Analysis</strong>: Optimal trade-off points identified correctly</p></li>
</ul>
</section>
<section id="inline-testing-validation">
<h3>Inline Testing &amp; Validation<a class="headerlink" href="#inline-testing-validation" title="Link to this heading">#</a></h3>
<p>The module includes comprehensive unit tests:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">BenchmarkResult</span><span class="o">...</span>
<span class="err">✅</span> <span class="n">Mean</span> <span class="n">calculation</span> <span class="n">correct</span><span class="p">:</span> <span class="mf">3.0</span>
<span class="err">✅</span> <span class="n">Std</span> <span class="n">calculation</span> <span class="n">matches</span> <span class="n">statistics</span> <span class="n">module</span>
<span class="err">✅</span> <span class="n">Confidence</span> <span class="n">intervals</span> <span class="n">bound</span> <span class="n">mean</span>
<span class="err">✅</span> <span class="n">Serialization</span> <span class="n">preserves</span> <span class="n">data</span>
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">BenchmarkResult</span> <span class="err">✓</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">Benchmark</span> <span class="n">latency</span><span class="o">...</span>
<span class="err">✅</span> <span class="n">Warmup</span> <span class="n">runs</span> <span class="n">executed</span> <span class="n">before</span> <span class="n">measurement</span>
<span class="err">✅</span> <span class="n">Multiple</span> <span class="n">measurement</span> <span class="n">runs</span> <span class="n">collected</span>
<span class="err">✅</span> <span class="n">Results</span> <span class="n">include</span> <span class="n">mean</span> <span class="err">±</span> <span class="n">CI</span>
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">Benchmark</span> <span class="err">✓</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">BenchmarkSuite</span><span class="o">...</span>
<span class="err">✅</span> <span class="n">All</span> <span class="n">benchmark</span> <span class="n">types</span> <span class="n">run</span> <span class="p">(</span><span class="n">latency</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">,</span> <span class="n">memory</span><span class="p">,</span> <span class="n">energy</span><span class="p">)</span>
<span class="err">✅</span> <span class="n">Results</span> <span class="n">organized</span> <span class="n">by</span> <span class="n">metric</span> <span class="nb">type</span>
<span class="err">✅</span> <span class="n">Visualizations</span> <span class="n">generated</span>
<span class="err">📈</span> <span class="n">Progress</span><span class="p">:</span> <span class="n">BenchmarkSuite</span> <span class="err">✓</span>
</pre></div>
</div>
</section>
<section id="manual-testing-examples">
<h3>Manual Testing Examples<a class="headerlink" href="#manual-testing-examples" title="Link to this heading">#</a></h3>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.benchmarking.benchmark</span><span class="w"> </span><span class="kn">import</span> <span class="n">Benchmark</span><span class="p">,</span> <span class="n">BenchmarkSuite</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.core.tensor</span><span class="w"> </span><span class="kn">import</span> <span class="n">Tensor</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">numpy</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="nn">np</span>
<span class="c1"># Create simple models for testing</span>
<span class="k">class</span><span class="w"> </span><span class="nc">FastModel</span><span class="p">:</span>
<span class="n">name</span> <span class="o">=</span> <span class="s2">&quot;fast_model&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
<span class="k">return</span> <span class="n">x</span> <span class="o">*</span> <span class="mi">2</span>
<span class="k">class</span><span class="w"> </span><span class="nc">SlowModel</span><span class="p">:</span>
<span class="n">name</span> <span class="o">=</span> <span class="s2">&quot;slow_model&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">x</span><span class="p">):</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">time</span>
<span class="n">time</span><span class="o">.</span><span class="n">sleep</span><span class="p">(</span><span class="mf">0.01</span><span class="p">)</span> <span class="c1"># Simulate 10ms latency</span>
<span class="k">return</span> <span class="n">x</span> <span class="o">*</span> <span class="mi">2</span>
<span class="c1"># Benchmark comparison</span>
<span class="n">models</span> <span class="o">=</span> <span class="p">[</span><span class="n">FastModel</span><span class="p">(),</span> <span class="n">SlowModel</span><span class="p">()]</span>
<span class="n">benchmark</span> <span class="o">=</span> <span class="n">Benchmark</span><span class="p">(</span><span class="n">models</span><span class="p">,</span> <span class="n">datasets</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">])</span>
<span class="c1"># Run latency benchmark</span>
<span class="n">results</span> <span class="o">=</span> <span class="n">benchmark</span><span class="o">.</span><span class="n">run_latency_benchmark</span><span class="p">()</span>
<span class="k">for</span> <span class="n">model_name</span><span class="p">,</span> <span class="n">result</span> <span class="ow">in</span> <span class="n">results</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">model_name</span><span class="si">}</span><span class="s2">: </span><span class="si">{</span><span class="n">result</span><span class="o">.</span><span class="n">mean</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> ± </span><span class="si">{</span><span class="n">result</span><span class="o">.</span><span class="n">std</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2">ms&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot; 95% CI: [</span><span class="si">{</span><span class="n">result</span><span class="o">.</span><span class="n">ci_lower</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2">, </span><span class="si">{</span><span class="n">result</span><span class="o">.</span><span class="n">ci_upper</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2">]&quot;</span><span class="p">)</span>
<span class="c1"># Speedup calculation</span>
<span class="n">fast_time</span> <span class="o">=</span> <span class="n">results</span><span class="p">[</span><span class="s1">&#39;fast_model&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">mean</span>
<span class="n">slow_time</span> <span class="o">=</span> <span class="n">results</span><span class="p">[</span><span class="s1">&#39;slow_model&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">mean</span>
<span class="n">speedup</span> <span class="o">=</span> <span class="n">slow_time</span> <span class="o">/</span> <span class="n">fast_time</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="se">\n</span><span class="s2">Speedup: </span><span class="si">{</span><span class="n">speedup</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2">x&quot;</span><span class="p">)</span>
</pre></div>
</div>
</section>
</section>
<section id="systems-thinking-questions">
<h2>Systems Thinking Questions<a class="headerlink" href="#systems-thinking-questions" title="Link to this heading">#</a></h2>
<section id="real-world-applications">
<h3>Real-World Applications<a class="headerlink" href="#real-world-applications" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Production ML Deployment</strong>: PyTorch runs continuous benchmarking before merging optimizations—statistical rigor prevents performance regressions</p></li>
<li><p><strong>Hardware Evaluation</strong>: Google's TPU teams benchmark every architecture iteration—measurements justify billion-dollar hardware investments</p></li>
<li><p><strong>Model Optimization</strong>: Meta benchmarks training efficiency (samples/sec, memory, convergence)—10% speedup saves hundreds of thousands in compute costs</p></li>
<li><p><strong>Research Validation</strong>: Papers require reproducible benchmarks with statistical significance—ablation studies need fair comparison protocols</p></li>
</ul>
</section>
<section id="statistical-foundations">
<h3>Statistical Foundations<a class="headerlink" href="#statistical-foundations" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Central Limit Theorem</strong>: The mean of many measurements → approximately normal distribution → confidence intervals and significance testing</p></li>
<li><p><strong>Measurement Uncertainty</strong>: Every measurement has variance—systematic errors (timer overhead) and random errors (thermal noise)</p></li>
<li><p><strong>Statistical Power</strong>: How many runs needed for significance? Depends on effect size and variance—5% speedup requires more runs than 50%</p></li>
<li><p><strong>Type I/II Errors</strong>: False positive (claiming speedup when it's noise) vs false negative (missing real speedup due to insufficient samples)</p></li>
</ul>
</section>
<section id="performance-characteristics">
<h3>Performance Characteristics<a class="headerlink" href="#performance-characteristics" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Warmup Effects</strong>: First run 20% slower than steady-state—cold cache, JIT compilation, memory allocation</p></li>
<li><p><strong>System Noise Sources</strong>: Thermal throttling (CPU frequency drops), OS interrupts (background tasks), memory pressure (GC pauses), network interference</p></li>
<li><p><strong>Observer Effect</strong>: Instrumentation changes behavior—profiling overhead 5%, cache effects from measurement code, branch prediction altered</p></li>
<li><p><strong>Hardware Variability</strong>: Optimization 3x faster on GPU but 1.1x on CPU—memory bandwidth helps GPU, CPU cache doesn't fit data</p></li>
</ul>
</section>
</section>
<section id="ready-to-build">
<h2>Ready to Build?<a class="headerlink" href="#ready-to-build" title="Link to this heading">#</a></h2>
<p>You've reached the penultimate module of the Optimization Tier. This benchmarking framework validates all your previous work from Modules 14-18, transforming subjective claims (“feels faster”) into objective data (“1.8x speedup, p &lt; 0.01, 95% CI [1.6x, 2.0x]”).</p>
<p>Your benchmarking infrastructure provides the measurement foundation for Module 20's competition workflow, where you'll use these tools to validate optimizations systematically. Fair measurement methodology ensures that your innovation is what gets recognized—not whoever got lucky with thermal throttling.</p>
<p>Module 20 teaches how to use your benchmarking framework in a competition context—generating submissions, validating constraints, and packaging results. Your benchmarking framework measures cumulative impact with statistical rigor. This is how production ML teams validate optimizations before deployment—rigorous measurement prevents regressions and quantifies improvements.</p>
<p>Statistical rigor isn't just academic formality—it's engineering discipline. When Meta claims 10% training speedup saves hundreds of thousands in compute costs, that claim requires measurements with confidence intervals and significance testing. Your framework implements this methodology from first principles.</p>
<p>Choose your preferred way to engage with this module:</p>
<div class="sd-container-fluid sd-sphinx-override sd-mb-4 docutils">
<div class="sd-row sd-row-cols-1 sd-row-cols-xs-1 sd-row-cols-sm-2 sd-row-cols-md-3 sd-row-cols-lg-3 docutils">
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
Launch Binder</div>
<p class="sd-card-text">Run this module interactively in your browser. No installation required.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/19_benchmarking/benchmarking_dev.ipynb"><span>https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/19_benchmarking/benchmarking_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
Open in Colab</div>
<p class="sd-card-text">Use Google Colab for GPU access and cloud compute power.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/19_benchmarking/benchmarking_dev.ipynb"><span>https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/19_benchmarking/benchmarking_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
View Source</div>
<p class="sd-card-text">Browse the Python source code and understand the implementation.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://github.com/mlsysbook/TinyTorch/blob/main/modules/19_benchmarking/benchmarking_dev.py"><span>https://github.com/mlsysbook/TinyTorch/blob/main/modules/19_benchmarking/benchmarking_dev.py</span></a></div>
</div>
</div>
</div>
<div class="tip admonition">
<p class="admonition-title">Save Your Progress</p>
<p>Binder sessions are temporary. Download your completed notebook when done, or switch to local development for persistent work.</p>
</div>
<hr class="docutils" />
<div class="prev-next-area">
<a class="left-prev" href="../modules/18_acceleration_ABOUT.html" title="previous page">Previous Module</a>
<a class="right-next" href="../modules/20_capstone_ABOUT.html" title="next page">Next Module</a>
</div>
</section>
</section>
<script type="text/x-thebe-config">
{
requestKernel: true,
binderOptions: {
repo: "binder-examples/jupyter-stacks-datascience",
ref: "master",
},
codeMirrorConfig: {
theme: "abcdef",
mode: "python"
},
kernelOptions: {
name: "python3",
path: "./modules"
},
predefinedOutput: true
}
</script>
<script>kernelName = 'python3'</script>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
<a class="left-prev"
href="18_acceleration_ABOUT.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">18. Acceleration - CPU Vectorization &amp; Cache Optimization</p>
</div>
</a>
<a class="right-next"
href="../tiers/olympics.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">🏅 Torch Olympics (Module 20)</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<!-- Secondary sidebar: in-page table of contents linking to the section anchors of this page. -->
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="page-toc tocsection onthispage">
<!-- The list icon is decorative; hide it from assistive technology so only the text label is announced. -->
<i class="fa-solid fa-list" aria-hidden="true"></i> Contents
</div>
<!-- aria-label gives this navigation landmark an accessible name so it can be told apart from other nav regions. -->
<nav class="bd-toc-nav page-toc" aria-label="Contents">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-analyze">Build → Use → Analyze</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#core-benchmarking-components">Core Benchmarking Components</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#statistical-measurement-infrastructure">1. Statistical Measurement Infrastructure</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#warmup-and-measurement-protocol">2. Warmup and Measurement Protocol</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#normalized-metrics-for-fair-comparison">3. Normalized Metrics for Fair Comparison</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-benchmark-suite">4. Comprehensive Benchmark Suite</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-benchmarking-principles">Real-World Benchmarking Principles</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#reproducibility-requirements">Reproducibility Requirements</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#fair-comparison-protocol">Fair Comparison Protocol</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#statistical-significance-testing">Statistical Significance Testing</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#connection-to-competition-workflow-module-20">Connection to Competition Workflow (Module 20)</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-validation">Inline Testing &amp; Validation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#statistical-foundations">Statistical Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
</nav></div>
</div></div>
</div>
<!-- Content footer: author credit and copyright notice. -->
<footer class="bd-footer-content">
<div class="bd-footer-content__inner container">
<div class="footer-item">
<p class="component-author">
By Prof. Vijay Janapa Reddi (Harvard University)
</p>
</div>
<div class="footer-item">
<p class="copyright">
© Copyright 2025.
</p>
</div>
<!-- NOTE(review): the two footer items below are empty; presumably template slots that rendered no content. Kept so the container's child structure is unchanged. -->
<div class="footer-item">
</div>
<div class="footer-item">
</div>
</div>
</footer>
</main>
</div>
</div>
<!-- Theme scripts (Bootstrap first, then pydata-sphinx-theme) sit at the end of <body>, after the page content, so fetching them does not block parsing of the markup above. -->
<script src="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>
<!-- NOTE(review): this footer has no content; presumably a theme template slot with nothing configured. An empty <footer> may still be exposed as a landmark to assistive technology; confirm whether it can be dropped. -->
<footer class="bd-footer">
</footer>
</body>
</html>