Files
TinyTorch/modules/14_profiling_ABOUT.html
2025-12-05 00:52:38 +00:00

1253 lines
102 KiB
HTML
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="en" data-content_root="../" >
<head>
<meta charset="utf-8" />
<!-- One viewport declaration is sufficient for the whole document. -->
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>14. Profiling - Performance Measurement for ML Systems &#8212; Tiny🔥Torch</title>
<!-- Restore the saved colour mode/theme from localStorage before any stylesheet is applied. -->
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link href="../_static/vendor/fontawesome/6.5.2/css/all.min.css?digest=dfe6caa3a7d634c4db9b" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.5.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=03e43079" />
<link rel="stylesheet" type="text/css" href="../_static/styles/sphinx-book-theme.css?v=eba8b062" />
<link rel="stylesheet" type="text/css" href="../_static/togglebutton.css?v=13237357" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
<link rel="stylesheet" type="text/css" href="../_static/mystnb.8ecb98da25f57f5357bf6f572d296f466b2cfe2517ffebfabe82451661e28f02.css" />
<link rel="stylesheet" type="text/css" href="../_static/sphinx-thebe.css?v=4fa983c6" />
<link rel="stylesheet" type="text/css" href="../_static/sphinx-design.min.css?v=95c83b7e" />
<link rel="stylesheet" type="text/css" href="../_static/custom.css?v=009d37f4" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b" />
<script src="../_static/vendor/fontawesome/6.5.2/js/all.min.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/documentation_options.js?v=9eb32ce0"></script>
<script src="../_static/doctools.js?v=9a2dae69"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
<script src="../_static/copybutton.js?v=f281be69"></script>
<script src="../_static/scripts/sphinx-book-theme.js?v=887ef09a"></script>
<!-- Toggle-button configuration globals. -->
<script>let toggleHintShow = 'Click to show';</script>
<script>let toggleHintHide = 'Click to hide';</script>
<script>let toggleOpenOnPrint = 'true';</script>
<script src="../_static/togglebutton.js?v=4a39c7ea"></script>
<script>var togglebuttonSelector = '.toggle, .admonition.dropdown';</script>
<script src="../_static/design-tabs.js?v=f930bc37"></script>
<!-- Thebe configuration. Declared exactly once: these are top-level const bindings, so declaring them again in another script element raises a redeclaration SyntaxError. -->
<script>const THEBE_JS_URL = "https://unpkg.com/thebe@0.8.2/lib/index.js"; const thebe_selector = ".thebe,.cell"; const thebe_selector_input = "pre"; const thebe_selector_output = ".output, .cell_output"</script>
<script async="async" src="../_static/sphinx-thebe.js?v=c100c467"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'modules/14_profiling_ABOUT';</script>
<!-- Site-specific scripts -->
<script src="../_static/ml-timeline.js?v=76e9b3e3"></script>
<script src="../_static/wip-banner.js?v=04a7e74d"></script>
<script src="../_static/marimo-badges.js?v=e6289128"></script>
<script src="../_static/sidebar-link.js?v=404b701b"></script>
<script src="../_static/hero-carousel.js?v=10341d2a"></script>
<script src="../_static/subscribe-modal.js?v=42919b64"></script>
<link rel="icon" href="../_static/favicon.svg"/>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="15. Quantization - Reduced Precision for Efficiency" href="15_quantization_ABOUT.html" />
<link rel="prev" title="⏱️ Optimization Tier (Modules 14-19)" href="../tiers/optimization.html" />
<meta name="docsearch:language" content="en"/>
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<input type="checkbox"
class="sidebar-toggle"
id="pst-primary-sidebar-checkbox"/>
<label class="overlay overlay-primary" for="pst-primary-sidebar-checkbox"></label>
<input type="checkbox"
class="sidebar-toggle"
id="pst-secondary-sidebar-checkbox"/>
<label class="overlay overlay-secondary" for="pst-secondary-sidebar-checkbox"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search..."
aria-label="Search..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<a class="navbar-brand logo" href="../intro.html">
<img src="../_static/logo-tinytorch.png" class="logo__image only-light" alt="Tiny🔥Torch - Home"/>
<script>document.write(`<img src="../_static/logo-tinytorch.png" class="logo__image only-dark" alt="Tiny🔥Torch - Home"/>`);</script>
</a></div>
<div class="sidebar-primary-item">
<script>
document.write(`
<button class="btn search-button-field search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
`);
</script></div>
<div class="sidebar-primary-item"><nav class="bd-links bd-docs-nav" aria-label="Main">
<div class="bd-toc-item navbar-nav active">
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🚀 Getting Started</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../getting-started.html">Complete Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏗 Foundation Tier (01-07)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/foundation.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="01_tensor_ABOUT.html">01. Tensor</a></li>
<li class="toctree-l1"><a class="reference internal" href="02_activations_ABOUT.html">02. Activations</a></li>
<li class="toctree-l1"><a class="reference internal" href="03_layers_ABOUT.html">03. Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="04_losses_ABOUT.html">04. Losses</a></li>
<li class="toctree-l1"><a class="reference internal" href="05_autograd_ABOUT.html">05. Autograd</a></li>
<li class="toctree-l1"><a class="reference internal" href="06_optimizers_ABOUT.html">06. Optimizers</a></li>
<li class="toctree-l1"><a class="reference internal" href="07_training_ABOUT.html">07. Training</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏛️ Architecture Tier (08-13)</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/architecture.html">📖 Tier Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="08_dataloader_ABOUT.html">08. DataLoader</a></li>
<li class="toctree-l1"><a class="reference internal" href="09_spatial_ABOUT.html">09. Convolutions</a></li>
<li class="toctree-l1"><a class="reference internal" href="10_tokenization_ABOUT.html">10. Tokenization</a></li>
<li class="toctree-l1"><a class="reference internal" href="11_embeddings_ABOUT.html">11. Embeddings</a></li>
<li class="toctree-l1"><a class="reference internal" href="12_attention_ABOUT.html">12. Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="13_transformers_ABOUT.html">13. Transformers</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">⏱️ Optimization Tier (14-19)</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/optimization.html">📖 Tier Overview</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">14. Profiling</a></li>
<li class="toctree-l1"><a class="reference internal" href="15_quantization_ABOUT.html">15. Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="16_compression_ABOUT.html">16. Compression</a></li>
<li class="toctree-l1"><a class="reference internal" href="17_memoization_ABOUT.html">17. Memoization</a></li>
<li class="toctree-l1"><a class="reference internal" href="18_acceleration_ABOUT.html">18. Acceleration</a></li>
<li class="toctree-l1"><a class="reference internal" href="19_benchmarking_ABOUT.html">19. Benchmarking</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🏅 Capstone Competition</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tiers/olympics.html">📖 Competition Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="20_capstone_ABOUT.html">20. Torch Olympics</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🧭 Course Orientation</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../chapters/00-introduction.html">Course Structure</a></li>
<li class="toctree-l1"><a class="reference internal" href="../prerequisites.html">Prerequisites &amp; Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/learning-journey.html">Learning Journey</a></li>
<li class="toctree-l1"><a class="reference internal" href="../chapters/milestones.html">Historical Milestones</a></li>
<li class="toctree-l1"><a class="reference internal" href="../faq.html">FAQ</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🛠️ TITO CLI Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../tito/overview.html">Command Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/modules.html">Module Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/milestones.html">Milestone System</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/data.html">Progress &amp; Data</a></li>
<li class="toctree-l1"><a class="reference internal" href="../tito/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="../datasets.html">Datasets Guide</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">🤝 Community</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../community.html">Ecosystem</a></li>
<li class="toctree-l1"><a class="reference internal" href="../resources.html">Learning Resources</a></li>
<li class="toctree-l1"><a class="reference internal" href="../credits.html">Credits &amp; Acknowledgments</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="sbt-scroll-pixel-helper"></div>
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<!-- Icon-only control: aria-label gives it an accessible name that does not depend on the tooltip script. -->
<button class="sidebar-toggle primary-toggle btn btn-sm" title="Toggle primary sidebar" aria-label="Toggle primary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-bars"></span>
</button></div>
</div>
<div class="header-article-items__end">
<div class="header-article-item">
<div class="article-header-buttons">
<div class="dropdown dropdown-download-buttons">
<button class="btn dropdown-toggle" type="button" data-bs-toggle="dropdown" aria-expanded="false" aria-label="Download this page">
<i class="fas fa-download"></i>
</button>
<ul class="dropdown-menu">
<li><a href="../_sources/modules/14_profiling_ABOUT.md" target="_blank"
class="btn btn-sm btn-download-source-button dropdown-item"
title="Download source file"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file"></i>
</span>
<span class="btn__text-container">.md</span>
</a>
</li>
<li>
<button onclick="window.print()"
class="btn btn-sm btn-download-pdf-button dropdown-item"
title="Print to PDF"
data-bs-placement="left" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-file-pdf"></i>
</span>
<span class="btn__text-container">.pdf</span>
</button>
</li>
</ul>
</div>
<!-- Icon-only control: aria-label gives it an accessible name that does not depend on the tooltip script. -->
<button onclick="toggleFullScreen()"
class="btn btn-sm btn-fullscreen-button"
title="Fullscreen mode"
aria-label="Fullscreen mode"
data-bs-placement="bottom" data-bs-toggle="tooltip"
>
<span class="btn__icon-container">
<i class="fas fa-expand"></i>
</span>
</button>
<script>
document.write(`
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"></i>
</button>
`);
</script>
<script>
document.write(`
<button class="btn btn-sm pst-navbar-icon search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass fa-lg"></i>
</button>
`);
</script>
<!-- Icon-only control: aria-label gives it an accessible name that does not depend on the tooltip script. -->
<button class="sidebar-toggle secondary-toggle btn btn-sm" title="Toggle secondary sidebar" aria-label="Toggle secondary sidebar" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="fa-solid fa-list"></span>
</button>
</div></div>
</div>
</div>
</div>
<div id="jb-print-docs-body" class="onlyprint">
<h1>14. Profiling - Performance Measurement for ML Systems</h1>
<!-- Table of contents -->
<div id="print-main-content">
<div id="jb-print-toc">
<div>
<h2> Contents </h2>
</div>
<nav aria-label="Page">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#why-this-matters">Why This Matters</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-context-profiling-drives-optimization-economics">Production Context: Profiling Drives Optimization Economics</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#historical-evolution-from-ad-hoc-timing-to-systematic-measurement">Historical Evolution: From Ad-Hoc Timing to Systematic Measurement</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#what-youll-actually-build">What You’ll Actually Build</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-reflect">Build → Use → Reflect</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#core-component-profiler-class">Core Component: Profiler Class</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#parameter-counting-memory-footprint-analysis">Parameter Counting: Memory Footprint Analysis</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#flop-counting-computational-cost-analysis">FLOP Counting: Computational Cost Analysis</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#memory-profiling-understanding-allocation-patterns">Memory Profiling: Understanding Allocation Patterns</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#latency-measurement-statistical-timing-methodology">Latency Measurement: Statistical Timing Methodology</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-validation">Inline Testing &amp; Validation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#profiling-foundations">Profiling Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
</nav>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section id="profiling-performance-measurement-for-ml-systems">
<h1>14. Profiling - Performance Measurement for ML Systems<a class="headerlink" href="#profiling-performance-measurement-for-ml-systems" title="Link to this heading">#</a></h1>
<p><strong>OPTIMIZATION TIER</strong> | Difficulty: ⭐⭐⭐ (3/4) | Time: 5-6 hours</p>
<section id="overview">
<h2>Overview<a class="headerlink" href="#overview" title="Link to this heading">#</a></h2>
<p>Build profiling tools that measure where compute and memory go in ML systems. This module implements parameter counters, FLOP analyzers, memory trackers, and timing profilers with statistical rigor. You’ll profile real models to identify bottlenecks—memory-bound vs compute-bound, attention vs feedforward, batch size effects—and use data to guide optimization decisions.</p>
<p><strong>Optimization Tier Focus</strong>: Modules 1-13 taught you to build ML systems. Modules 14-20 teach you to measure and optimize them. Profiling is the foundation—you can’t optimize what you don’t measure.</p>
</section>
<section id="why-this-matters">
<h2>Why This Matters<a class="headerlink" href="#why-this-matters" title="Link to this heading">#</a></h2>
<section id="production-context-profiling-drives-optimization-economics">
<h3>Production Context: Profiling Drives Optimization Economics<a class="headerlink" href="#production-context-profiling-drives-optimization-economics" title="Link to this heading">#</a></h3>
<p>Every major ML organization profiles extensively:</p>
<ul class="simple">
<li><p><strong>Google TPU teams</strong> profile every kernel to achieve 40-50% MFU (Model FLOPs Utilization), translating to millions in compute savings</p></li>
<li><p><strong>OpenAI</strong> profiles GPT training runs to identify gradient checkpointing opportunities, reducing memory by 10× with minimal speed cost</p></li>
<li><p><strong>Meta</strong> profiles PyTorch inference serving billions of requests daily, using data to guide operator fusion and quantization decisions</p></li>
<li><p><strong>NVIDIA</strong> uses Nsight profiler to optimize cuDNN kernels, achieving near-theoretical-peak performance on tensor cores</p></li>
</ul>
<p><strong>The Economics</strong>: A 10% optimization on a $10M training run saves $1M. But only if you measure first—guessing wastes engineering time on non-bottlenecks.</p>
</section>
<section id="historical-evolution-from-ad-hoc-timing-to-systematic-measurement">
<h3>Historical Evolution: From Ad-Hoc Timing to Systematic Measurement<a class="headerlink" href="#historical-evolution-from-ad-hoc-timing-to-systematic-measurement" title="Link to this heading">#</a></h3>
<p>Profiling evolved with ML scale:</p>
<ul class="simple">
<li><p><strong>Pre-2012 (Small models)</strong>: Ad-hoc timing with <code class="docutils literal notranslate"><span class="pre">time.time()</span></code>, no systematic methodology</p></li>
<li><p><strong>2012-2017 (Deep learning era)</strong>: NVIDIA profiler, TensorBoard timing; focus on GPU utilization</p></li>
<li><p><strong>2018+ (Production scale)</strong>: Comprehensive profiling (compute, memory, I/O, network); optimization becomes economically critical</p></li>
<li><p><strong>2020+ (Modern systems)</strong>: Automated profiling guides ML compilers; tools like PyTorch Profiler integrate with training workflows</p></li>
</ul>
</section>
<section id="what-youll-actually-build">
<h3>What You’ll Actually Build<a class="headerlink" href="#what-youll-actually-build" title="Link to this heading">#</a></h3>
<p>Let’s be precise about what you implement in this module:</p>
<p><strong>You WILL build</strong>:</p>
<ul class="simple">
<li><p>Parameter counter: Walks model structure, sums weight and bias elements</p></li>
<li><p>FLOP counter: Calculates theoretical operations for Linear, Conv2d based on dimensions</p></li>
<li><p>Memory profiler: Uses Python’s tracemalloc to track allocations during forward/backward</p></li>
<li><p>Timing profiler: Uses time.perf_counter() with warmup runs and statistical analysis (median latency)</p></li>
</ul>
<p><strong>You will NOT build</strong> (these are production tools requiring kernel instrumentation):</p>
<ul class="simple">
<li><p>GPU profiler (requires CUDA kernel hooks)</p></li>
<li><p>PyTorch Profiler integration (requires autograd instrumentation)</p></li>
<li><p>Operator-level timeline traces (requires framework integration)</p></li>
</ul>
<p><strong>Why this scope matters</strong>: You’ll understand profiling fundamentals that transfer to production tools. The techniques you implement (parameter counting formulas, FLOP calculations, statistical timing) are exactly what PyTorch Profiler and TensorBoard use internally. You’re building the same measurement primitives, just without kernel-level instrumentation.</p>
</section>
</section>
<section id="learning-objectives">
<h2>Learning Objectives<a class="headerlink" href="#learning-objectives" title="Link to this heading">#</a></h2>
<p>By the end of this module, you will be able to:</p>
<ul class="simple">
<li><p><strong>Count parameters accurately</strong>: Predict model size and memory footprint by counting weights and biases across different layer types</p></li>
<li><p><strong>Measure computational cost</strong>: Implement FLOP counters that calculate theoretical compute for matrix multiplications, convolutions, and attention operations</p></li>
<li><p><strong>Track memory usage</strong>: Build memory profilers using tracemalloc to measure parameter, activation, and gradient memory during forward and backward passes</p></li>
<li><p><strong>Profile latency rigorously</strong>: Create timing profilers with warmup runs, multiple iterations, and statistical analysis (median, confidence intervals)</p></li>
<li><p><strong>Identify performance bottlenecks</strong>: Analyze profiling data to distinguish memory-bound from compute-bound operations and prioritize optimization efforts</p></li>
</ul>
</section>
<section id="build-use-reflect">
<h2>Build → Use → Reflect<a class="headerlink" href="#build-use-reflect" title="Link to this heading">#</a></h2>
<p>This module follows TinyTorch’s <strong>Build → Use → Reflect</strong> framework:</p>
<ol class="arabic simple">
<li><p><strong>Build</strong>: Implement Profiler class with parameter counting, FLOP calculation, memory tracking, and latency measurement using time.perf_counter() and tracemalloc</p></li>
<li><p><strong>Use</strong>: Profile complete models to measure characteristics, compare MLP vs attention operations, analyze batch size impact on throughput, and benchmark different architectures</p></li>
<li><p><strong>Reflect</strong>: Where does compute time actually go in transformers? When is your system memory-bound vs compute-bound? How do measurement choices affect optimization decisions?</p></li>
</ol>
</section>
<section id="implementation-guide">
<h2>Implementation Guide<a class="headerlink" href="#implementation-guide" title="Link to this heading">#</a></h2>
<section id="core-component-profiler-class">
<h3>Core Component: Profiler Class<a class="headerlink" href="#core-component-profiler-class" title="Link to this heading">#</a></h3>
<p>The Profiler class provides comprehensive performance analysis:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">Profiler</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Professional-grade ML model profiler.</span>
<span class="sd"> Measures parameters, FLOPs, memory, and latency with statistical rigor.</span>
<span class="sd"> Used for bottleneck identification and optimization guidance.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">measurements</span> <span class="o">=</span> <span class="p">{}</span>
<span class="bp">self</span><span class="o">.</span><span class="n">operation_counts</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">int</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">count_parameters</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">model</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Count total trainable parameters.</span>
<span class="sd"> Returns:</span>
<span class="sd"> Total parameter count (e.g., 125M for GPT-2 Small)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">total</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="s1">&#39;parameters&#39;</span><span class="p">):</span>
<span class="k">for</span> <span class="n">param</span> <span class="ow">in</span> <span class="n">model</span><span class="o">.</span><span class="n">parameters</span><span class="p">():</span>
<span class="n">total</span> <span class="o">+=</span> <span class="n">param</span><span class="o">.</span><span class="n">data</span><span class="o">.</span><span class="n">size</span> <span class="c1"># Count elements</span>
<span class="k">return</span> <span class="n">total</span>
<span class="k">def</span><span class="w"> </span><span class="nf">count_flops</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">model</span><span class="p">,</span> <span class="n">input_shape</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Count FLOPs (Floating Point Operations) for forward pass.</span>
<span class="sd"> Linear layer: 2 × M × K × N (matmul is M×K @ K×N)</span>
<span class="sd"> Conv2d: 2 × output_h × output_w × kernel_h × kernel_w × in_ch × out_ch</span>
<span class="sd"> Returns:</span>
<span class="sd"> Total FLOPs for one forward pass (hardware-independent)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># Implementation calculates based on layer type and dimensions</span>
<span class="k">def</span><span class="w"> </span><span class="nf">measure_memory</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">model</span><span class="p">,</span> <span class="n">input_shape</span><span class="p">:</span> <span class="n">Tuple</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Dict</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Measure memory usage during forward pass.</span>
<span class="sd"> Uses tracemalloc to track:</span>
<span class="sd"> - Parameter memory (weights, biases)</span>
<span class="sd"> - Activation memory (intermediate tensors)</span>
<span class="sd"> - Peak memory (maximum allocation)</span>
<span class="sd"> Returns:</span>
<span class="sd"> Dict with memory breakdown in MB</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">tracemalloc</span><span class="o">.</span><span class="n">start</span><span class="p">()</span>
<span class="c1"># Run forward pass, measure peak allocation</span>
<span class="k">def</span><span class="w"> </span><span class="nf">measure_latency</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">model</span><span class="p">,</span> <span class="n">input_tensor</span><span class="p">,</span>
<span class="n">warmup</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10</span><span class="p">,</span> <span class="n">iterations</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">100</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Measure inference latency with statistical rigor.</span>
<span class="sd"> Protocol:</span>
<span class="sd"> 1. Warmup runs (cache warming, JIT compilation)</span>
<span class="sd"> 2. Multiple measurements (statistical significance)</span>
<span class="sd"> 3. Median calculation (robust to outliers)</span>
<span class="sd"> Returns:</span>
<span class="sd"> Median latency in milliseconds</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># Warmup runs (discard results)</span>
<span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">warmup</span><span class="p">):</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">forward</span><span class="p">(</span><span class="n">input_tensor</span><span class="p">)</span>
<span class="c1"># Timed runs</span>
<span class="n">times</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">iterations</span><span class="p">):</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">perf_counter</span><span class="p">()</span> <span class="c1"># High-precision timer</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">forward</span><span class="p">(</span><span class="n">input_tensor</span><span class="p">)</span>
<span class="n">times</span><span class="o">.</span><span class="n">append</span><span class="p">((</span><span class="n">time</span><span class="o">.</span><span class="n">perf_counter</span><span class="p">()</span> <span class="o">-</span> <span class="n">start</span><span class="p">)</span> <span class="o">*</span> <span class="mi">1000</span><span class="p">)</span> <span class="c1"># Convert to ms</span>
<span class="k">return</span> <span class="n">np</span><span class="o">.</span><span class="n">median</span><span class="p">(</span><span class="n">times</span><span class="p">)</span> <span class="c1"># Median is robust to outliers</span>
</pre></div>
</div>
</section>
<section id="parameter-counting-memory-footprint-analysis">
<h3>Parameter Counting: Memory Footprint Analysis<a class="headerlink" href="#parameter-counting-memory-footprint-analysis" title="Link to this heading">#</a></h3>
<p>Parameter counting predicts model size and memory requirements:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Linear layer example</span>
<span class="n">layer</span> <span class="o">=</span> <span class="n">Linear</span><span class="p">(</span><span class="mi">768</span><span class="p">,</span> <span class="mi">3072</span><span class="p">)</span> <span class="c1"># GPT-2 feedforward dimension</span>
<span class="c1"># Manual calculation:</span>
<span class="n">weight_params</span> <span class="o">=</span> <span class="mi">768</span> <span class="err">×</span> <span class="mi">3072</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span><span class="mi">359</span><span class="p">,</span><span class="mi">296</span>
<span class="n">bias_params</span> <span class="o">=</span> <span class="mi">3072</span>
<span class="n">total_params</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span><span class="mi">362</span><span class="p">,</span><span class="mi">368</span>
<span class="c1"># Memory at FP32 (4 bytes per parameter):</span>
<span class="n">memory_bytes</span> <span class="o">=</span> <span class="mi">2</span><span class="p">,</span><span class="mi">362</span><span class="p">,</span><span class="mi">368</span> <span class="err">×</span> <span class="mi">4</span> <span class="o">=</span> <span class="mi">9</span><span class="p">,</span><span class="mi">449</span><span class="p">,</span><span class="mi">472</span> <span class="nb">bytes</span> <span class="o">=</span> <span class="mf">9.01</span> <span class="n">MiB</span>
<span class="c1"># Profiler implementation:</span>
<span class="n">profiler</span> <span class="o">=</span> <span class="n">Profiler</span><span class="p">()</span>
<span class="n">count</span> <span class="o">=</span> <span class="n">profiler</span><span class="o">.</span><span class="n">count_parameters</span><span class="p">(</span><span class="n">layer</span><span class="p">)</span>
<span class="k">assert</span> <span class="n">count</span> <span class="o">==</span> <span class="mi">2_362_368</span>
<span class="c1"># Why this matters:</span>
<span class="c1"># GPT-2 Small: 124M params → 496 MB</span>
<span class="c1"># GPT-2 XL: 1.5B params → 6.0 GB</span>
<span class="c1"># Knowing parameter count predicts deployment hardware requirements</span>
</pre></div>
</div>
<p><strong>Parameter Counting Strategy</strong>:</p>
<ul class="simple">
<li><p>Linear layers: <code class="docutils literal notranslate"><span class="pre">(input_features</span> <span class="pre">×</span> <span class="pre">output_features)</span> <span class="pre">+</span> <span class="pre">output_features</span></code></p></li>
<li><p>Conv2d layers: <code class="docutils literal notranslate"><span class="pre">(kernel_h</span> <span class="pre">×</span> <span class="pre">kernel_w</span> <span class="pre">×</span> <span class="pre">in_channels</span> <span class="pre">×</span> <span class="pre">out_channels)</span> <span class="pre">+</span> <span class="pre">out_channels</span></code></p></li>
<li><p>Embeddings: <code class="docutils literal notranslate"><span class="pre">vocab_size</span> <span class="pre">×</span> <span class="pre">embedding_dim</span></code></p></li>
<li><p>Attention: Count the Q, K, V, and output projection weights separately</p></li>
</ul>
</section>
<section id="flop-counting-computational-cost-analysis">
<h3>FLOP Counting: Computational Cost Analysis<a class="headerlink" href="#flop-counting-computational-cost-analysis" title="Link to this heading">#</a></h3>
<p>FLOPs measure compute independently of hardware:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Matrix multiplication FLOP calculation</span>
<span class="c1"># C = A @ B where A is (M, K) and B is (K, N)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">count_matmul_flops</span><span class="p">(</span><span class="n">M</span><span class="p">,</span> <span class="n">K</span><span class="p">,</span> <span class="n">N</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Each output element C[i,j] requires K multiply-adds.</span>
<span class="sd"> Total outputs: M × N</span>
<span class="sd"> FLOPs per output: 2 × K (multiply + add)</span>
<span class="sd"> Total FLOPs: 2 × M × K × N</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="mi">2</span> <span class="o">*</span> <span class="n">M</span> <span class="o">*</span> <span class="n">K</span> <span class="o">*</span> <span class="n">N</span>
<span class="c1"># Example: GPT-2 feedforward forward pass</span>
<span class="n">batch_size</span> <span class="o">=</span> <span class="mi">32</span>
<span class="n">seq_len</span> <span class="o">=</span> <span class="mi">512</span>
<span class="n">d_model</span> <span class="o">=</span> <span class="mi">768</span>
<span class="n">d_ff</span> <span class="o">=</span> <span class="mi">3072</span>
<span class="c1"># First linear: (batch × seq, d_model) @ (d_model, d_ff)</span>
<span class="n">flops_1</span> <span class="o">=</span> <span class="n">count_matmul_flops</span><span class="p">(</span><span class="n">batch_size</span> <span class="o">*</span> <span class="n">seq_len</span><span class="p">,</span> <span class="n">d_model</span><span class="p">,</span> <span class="n">d_ff</span><span class="p">)</span>
<span class="c1"># = 2 × 16384 × 768 × 3072 = 77,309,411,328 FLOPs</span>
<span class="c1"># Second linear: (batch × seq, d_ff) @ (d_ff, d_model)</span>
<span class="n">flops_2</span> <span class="o">=</span> <span class="n">count_matmul_flops</span><span class="p">(</span><span class="n">batch_size</span> <span class="o">*</span> <span class="n">seq_len</span><span class="p">,</span> <span class="n">d_ff</span><span class="p">,</span> <span class="n">d_model</span><span class="p">)</span>
<span class="c1"># = 2 × 16384 × 3072 × 768 = 77,309,411,328 FLOPs</span>
<span class="n">total_flops</span> <span class="o">=</span> <span class="n">flops_1</span> <span class="o">+</span> <span class="n">flops_2</span> <span class="c1"># ~154 GFLOPs for one feedforward layer</span>
<span class="c1"># Hardware context:</span>
<span class="c1"># NVIDIA A100: 312 TFLOPS (FP16) → theoretical time = 154 GFLOPs / 312,000 GFLOPS ≈ 0.0005 s = 0.5 ms</span>
<span class="c1"># Actual time will be higher due to memory bandwidth and kernel overhead</span>
</pre></div>
</div>
<p><strong>FLOP Formulas Reference</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Linear layer</span>
<span class="n">flops</span> <span class="o">=</span> <span class="mi">2</span> <span class="err">×</span> <span class="n">batch_size</span> <span class="err">×</span> <span class="n">seq_len</span> <span class="err">×</span> <span class="n">input_features</span> <span class="err">×</span> <span class="n">output_features</span>
<span class="c1"># Conv2d</span>
<span class="n">flops</span> <span class="o">=</span> <span class="mi">2</span> <span class="err">×</span> <span class="n">batch</span> <span class="err">×</span> <span class="n">output_h</span> <span class="err">×</span> <span class="n">output_w</span> <span class="err">×</span> <span class="n">kernel_h</span> <span class="err">×</span> <span class="n">kernel_w</span> <span class="err">×</span> <span class="n">in_ch</span> <span class="err">×</span> <span class="n">out_ch</span>
<span class="c1"># Multi-head attention (simplified)</span>
<span class="c1"># QKV projections: 3 × linear projections</span>
<span class="c1"># Attention scores: 2 × batch × heads × seq × seq × d_k</span>
<span class="c1"># Attention weighting: 2 × batch × heads × seq × seq × d_k</span>
<span class="c1"># Output projection: 1 × linear projection (each projection costs 2 × batch × seq × d_model × d_model)</span>
<span class="n">flops</span> <span class="o">=</span> <span class="p">(</span><span class="mi">8</span> <span class="err">×</span> <span class="n">batch</span> <span class="err">×</span> <span class="n">seq</span> <span class="err">×</span> <span class="n">d_model</span> <span class="err">×</span> <span class="n">d_model</span><span class="p">)</span> <span class="o">+</span>
<span class="p">(</span><span class="mi">4</span> <span class="err">×</span> <span class="n">batch</span> <span class="err">×</span> <span class="n">heads</span> <span class="err">×</span> <span class="n">seq</span> <span class="err">×</span> <span class="n">seq</span> <span class="err">×</span> <span class="n">d_k</span><span class="p">)</span>
</pre></div>
</div>
</section>
<section id="memory-profiling-understanding-allocation-patterns">
<h3>Memory Profiling: Understanding Allocation Patterns<a class="headerlink" href="#memory-profiling-understanding-allocation-patterns" title="Link to this heading">#</a></h3>
<p>Memory profiling reveals where RAM goes during training:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">class</span><span class="w"> </span><span class="nc">MemoryProfiler</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Track memory allocations and identify usage patterns.&quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">snapshots</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">def</span><span class="w"> </span><span class="nf">snapshot</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">label</span><span class="p">:</span> <span class="nb">str</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Take memory snapshot at execution point.&quot;&quot;&quot;</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">psutil</span>
<span class="n">process</span> <span class="o">=</span> <span class="n">psutil</span><span class="o">.</span><span class="n">Process</span><span class="p">()</span>
<span class="n">mem_info</span> <span class="o">=</span> <span class="n">process</span><span class="o">.</span><span class="n">memory_info</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">snapshots</span><span class="o">.</span><span class="n">append</span><span class="p">({</span>
<span class="s1">&#39;label&#39;</span><span class="p">:</span> <span class="n">label</span><span class="p">,</span>
<span class="s1">&#39;rss&#39;</span><span class="p">:</span> <span class="n">mem_info</span><span class="o">.</span><span class="n">rss</span> <span class="o">/</span> <span class="mi">1024</span><span class="o">**</span><span class="mi">2</span><span class="p">,</span> <span class="c1"># Resident Set Size (MB)</span>
<span class="s1">&#39;timestamp&#39;</span><span class="p">:</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span>
<span class="p">})</span>
<span class="k">def</span><span class="w"> </span><span class="nf">report</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Generate memory usage report.&quot;&quot;&quot;</span>
<span class="nb">print</span><span class="p">(</span><span class="s2">&quot;Memory Timeline:&quot;</span><span class="p">)</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">snap</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">snapshots</span><span class="p">):</span>
<span class="n">delta</span> <span class="o">=</span> <span class="s2">&quot;&quot;</span>
<span class="k">if</span> <span class="n">i</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">delta_val</span> <span class="o">=</span> <span class="n">snap</span><span class="p">[</span><span class="s1">&#39;rss&#39;</span><span class="p">]</span> <span class="o">-</span> <span class="bp">self</span><span class="o">.</span><span class="n">snapshots</span><span class="p">[</span><span class="n">i</span><span class="o">-</span><span class="mi">1</span><span class="p">][</span><span class="s1">&#39;rss&#39;</span><span class="p">]</span>
<span class="n">delta</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot; (</span><span class="si">{</span><span class="n">delta_val</span><span class="si">:</span><span class="s2">+.2f</span><span class="si">}</span><span class="s2"> MB)&quot;</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot; </span><span class="si">{</span><span class="n">snap</span><span class="p">[</span><span class="s1">&#39;label&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">30s</span><span class="si">}</span><span class="s2">: </span><span class="si">{</span><span class="n">snap</span><span class="p">[</span><span class="s1">&#39;rss&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">8.2f</span><span class="si">}</span><span class="s2"> MB</span><span class="si">{</span><span class="n">delta</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="c1"># Example: Profile transformer forward pass</span>
<span class="n">mem</span> <span class="o">=</span> <span class="n">MemoryProfiler</span><span class="p">()</span>
<span class="n">mem</span><span class="o">.</span><span class="n">snapshot</span><span class="p">(</span><span class="s2">&quot;baseline&quot;</span><span class="p">)</span>
<span class="c1"># Forward pass</span>
<span class="n">output</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">forward</span><span class="p">(</span><span class="n">input_tensor</span><span class="p">)</span>
<span class="n">mem</span><span class="o">.</span><span class="n">snapshot</span><span class="p">(</span><span class="s2">&quot;after_forward&quot;</span><span class="p">)</span>
<span class="c1"># Backward pass</span>
<span class="n">loss</span> <span class="o">=</span> <span class="n">criterion</span><span class="p">(</span><span class="n">output</span><span class="p">,</span> <span class="n">target</span><span class="p">)</span>
<span class="n">loss</span><span class="o">.</span><span class="n">backward</span><span class="p">()</span>
<span class="n">mem</span><span class="o">.</span><span class="n">snapshot</span><span class="p">(</span><span class="s2">&quot;after_backward&quot;</span><span class="p">)</span>
<span class="c1"># Update weights</span>
<span class="n">optimizer</span><span class="o">.</span><span class="n">step</span><span class="p">()</span>
<span class="n">mem</span><span class="o">.</span><span class="n">snapshot</span><span class="p">(</span><span class="s2">&quot;after_optimizer&quot;</span><span class="p">)</span>
<span class="n">mem</span><span class="o">.</span><span class="n">report</span><span class="p">()</span>
<span class="c1"># Output interpretation:</span>
<span class="c1"># baseline : 1024.00 MB</span>
<span class="c1"># after_forward : 1124.00 MB (+100.00 MB) ← Activation memory</span>
<span class="c1"># after_backward : 1624.00 MB (+500.00 MB) ← Gradient memory</span>
<span class="c1"># after_optimizer : 2624.00 MB (+1000.00 MB) ← Adam state (momentum + velocity = 2× gradient memory)</span>
<span class="c1">#</span>
<span class="c1"># Total training memory ≈ 2.6× baseline memory (for Adam optimizer)</span>
</pre></div>
</div>
<p><strong>Memory Components Breakdown</strong>:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span>Training Memory = Parameters + Activations + Gradients + Optimizer State
Example for GPT-2 Small (124M parameters):
Parameters: 496 MB (124M × 4 bytes)
Activations: 200 MB (depends on batch size and sequence length)
Gradients: 496 MB (same as parameters)
Adam state: 992 MB (momentum + velocity = 2× parameters)
─────────────────────────────────────
Total: 2184 MB (4.4× parameter memory!)
Optimization strategies by component:
- Parameters: Quantization (reduce precision)
- Activations: Gradient checkpointing (recompute instead of store)
- Gradients: Mixed precision (FP16 gradients)
- Optimizer: SGD instead of Adam (0× vs 2× parameter memory)
</pre></div>
</div>
</section>
<section id="latency-measurement-statistical-timing-methodology">
<h3>Latency Measurement: Statistical Timing Methodology<a class="headerlink" href="#latency-measurement-statistical-timing-methodology" title="Link to this heading">#</a></h3>
<p>Accurate latency measurement requires handling variance:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="k">def</span><span class="w"> </span><span class="nf">measure_latency_correctly</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">input_tensor</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Production-quality latency measurement.&quot;&quot;&quot;</span>
<span class="c1"># Step 1: Warmup runs (stabilize system state)</span>
<span class="c1"># - JIT compilation happens on first runs</span>
<span class="c1"># - CPU/GPU caches warm up</span>
<span class="c1"># - Operating system scheduling stabilizes</span>
<span class="n">warmup_runs</span> <span class="o">=</span> <span class="mi">10</span>
<span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">warmup_runs</span><span class="p">):</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">forward</span><span class="p">(</span><span class="n">input_tensor</span><span class="p">)</span>
<span class="c1"># Step 2: Multiple measurements (statistical significance)</span>
<span class="n">times</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">measurement_runs</span> <span class="o">=</span> <span class="mi">100</span>
<span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">measurement_runs</span><span class="p">):</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">perf_counter</span><span class="p">()</span> <span class="c1"># Nanosecond precision</span>
<span class="n">_</span> <span class="o">=</span> <span class="n">model</span><span class="o">.</span><span class="n">forward</span><span class="p">(</span><span class="n">input_tensor</span><span class="p">)</span>
<span class="n">elapsed</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">perf_counter</span><span class="p">()</span> <span class="o">-</span> <span class="n">start</span>
<span class="n">times</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">elapsed</span> <span class="o">*</span> <span class="mi">1000</span><span class="p">)</span> <span class="c1"># Convert to milliseconds</span>
<span class="c1"># Step 3: Statistical analysis</span>
<span class="n">times</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="n">times</span><span class="p">)</span>
<span class="n">results</span> <span class="o">=</span> <span class="p">{</span>
<span class="s1">&#39;mean&#39;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">times</span><span class="p">),</span>
<span class="s1">&#39;median&#39;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">median</span><span class="p">(</span><span class="n">times</span><span class="p">),</span> <span class="c1"># Robust to outliers</span>
<span class="s1">&#39;std&#39;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">std</span><span class="p">(</span><span class="n">times</span><span class="p">),</span>
<span class="s1">&#39;min&#39;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">times</span><span class="p">),</span>
<span class="s1">&#39;max&#39;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">times</span><span class="p">),</span>
<span class="s1">&#39;p50&#39;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">percentile</span><span class="p">(</span><span class="n">times</span><span class="p">,</span> <span class="mi">50</span><span class="p">),</span> <span class="c1"># Median</span>
<span class="s1">&#39;p95&#39;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">percentile</span><span class="p">(</span><span class="n">times</span><span class="p">,</span> <span class="mi">95</span><span class="p">),</span> <span class="c1"># 95th percentile</span>
<span class="s1">&#39;p99&#39;</span><span class="p">:</span> <span class="n">np</span><span class="o">.</span><span class="n">percentile</span><span class="p">(</span><span class="n">times</span><span class="p">,</span> <span class="mi">99</span><span class="p">)</span> <span class="c1"># 99th percentile (tail latency)</span>
<span class="p">}</span>
<span class="k">return</span> <span class="n">results</span>
<span class="c1"># Example output:</span>
<span class="c1"># {</span>
<span class="c1"># &#39;mean&#39;: 5.234,</span>
<span class="c1"># &#39;median&#39;: 5.180, ← Use this for reporting (robust)</span>
<span class="c1"># &#39;std&#39;: 0.456,</span>
<span class="c1"># &#39;min&#39;: 4.890,</span>
<span class="c1"># &#39;max&#39;: 8.120, ← Outlier (OS scheduling event)</span>
<span class="c1"># &#39;p50&#39;: 5.180,</span>
<span class="c1"># &#39;p95&#39;: 5.890,</span>
<span class="c1"># &#39;p99&#39;: 6.340 ← Important for user-facing latency</span>
<span class="c1"># }</span>
<span class="c1"># Why median, not mean?</span>
<span class="c1"># Mean is sensitive to outliers (8.120 ms max skews average)</span>
<span class="c1"># Median represents typical performance</span>
<span class="c1"># For user-facing systems, also report p95 or p99 (tail latency: what the slowest 5% / 1% of requests experience)</span>
</pre></div>
</div>
<p><strong>Measurement Pitfalls and Solutions</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># ❌ WRONG: Single measurement</span>
<span class="n">start</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span> <span class="c1"># Low precision</span>
<span class="n">output</span> <span class="o">=</span> <span class="n">model</span><span class="p">(</span><span class="nb">input</span><span class="p">)</span>
<span class="n">latency</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">time</span><span class="p">()</span> <span class="o">-</span> <span class="n">start</span> <span class="c1"># Affected by system noise</span>
<span class="c1"># ✅ CORRECT: Statistical measurement</span>
<span class="n">profiler</span> <span class="o">=</span> <span class="n">Profiler</span><span class="p">()</span>
<span class="n">latency</span> <span class="o">=</span> <span class="n">profiler</span><span class="o">.</span><span class="n">measure_latency</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="nb">input</span><span class="p">,</span> <span class="n">warmup</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">100</span><span class="p">)</span>
<span class="c1"># Returns median of 100 measurements after 10 warmup runs</span>
<span class="c1"># ❌ WRONG: Measuring cold start</span>
<span class="n">latency</span> <span class="o">=</span> <span class="n">time_function_once</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">forward</span><span class="p">,</span> <span class="nb">input</span><span class="p">)</span> <span class="c1"># Includes JIT compilation</span>
<span class="c1"># ✅ CORRECT: Warmup runs</span>
<span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">10</span><span class="p">):</span>
<span class="n">model</span><span class="o">.</span><span class="n">forward</span><span class="p">(</span><span class="nb">input</span><span class="p">)</span> <span class="c1"># Discard these results</span>
<span class="n">latency</span> <span class="o">=</span> <span class="n">measure_with_statistics</span><span class="p">(</span><span class="n">model</span><span class="o">.</span><span class="n">forward</span><span class="p">,</span> <span class="nb">input</span><span class="p">)</span> <span class="c1"># Now measure</span>
<span class="c1"># ❌ WRONG: Using mean with outliers</span>
<span class="n">times</span> <span class="o">=</span> <span class="p">[</span><span class="mf">5.1</span><span class="p">,</span> <span class="mf">5.2</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">,</span> <span class="mf">5.3</span><span class="p">,</span> <span class="mf">50.0</span><span class="p">]</span> <span class="c1"># 50ms outlier from OS scheduling</span>
<span class="n">mean</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">times</span><span class="p">)</span> <span class="c1"># = 14.12 ms (misleading!)</span>
<span class="c1"># ✅ CORRECT: Using median</span>
<span class="n">median</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">median</span><span class="p">(</span><span class="n">times</span><span class="p">)</span> <span class="c1"># = 5.2 ms (representative)</span>
</pre></div>
</div>
</section>
</section>
<section id="getting-started">
<h2>Getting Started<a class="headerlink" href="#getting-started" title="Link to this heading">#</a></h2>
<section id="prerequisites">
<h3>Prerequisites<a class="headerlink" href="#prerequisites" title="Link to this heading">#</a></h3>
<p>Ensure you understand the foundations from previous modules:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Activate TinyTorch environment</span>
<span class="nb">source</span><span class="w"> </span>scripts/activate-tinytorch
<span class="c1"># Verify prerequisite modules (all modules 1-13)</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>tensor
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>activations
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>transformer
</pre></div>
</div>
<p><strong>Why these prerequisites</strong>: You’ll profile models built in Modules 1-13. Understanding the implementations helps you interpret profiling results (e.g., why attention is memory-bound).</p>
</section>
<section id="development-workflow">
<h3>Development Workflow<a class="headerlink" href="#development-workflow" title="Link to this heading">#</a></h3>
<ol class="arabic simple">
<li><p><strong>Open the development file</strong>: <code class="docutils literal notranslate"><span class="pre">modules/14_profiling/profiling_dev.ipynb</span></code> or <code class="docutils literal notranslate"><span class="pre">.py</span></code></p></li>
<li><p><strong>Implement parameter counting</strong>: Walk model structure, sum parameter elements</p></li>
<li><p><strong>Build FLOP counter</strong>: Calculate operations based on layer types and dimensions</p></li>
<li><p><strong>Create memory profiler</strong>: Use tracemalloc to track allocations during forward/backward</p></li>
<li><p><strong>Add timing profiler</strong>: Implement warmup runs, multiple measurements, statistical analysis</p></li>
<li><p><strong>Implement advanced profiling</strong>: Build <code class="docutils literal notranslate"><span class="pre">profile_forward_pass()</span></code> and <code class="docutils literal notranslate"><span class="pre">profile_backward_pass()</span></code> combining all metrics</p></li>
<li><p><strong>Export and verify</strong>: <code class="docutils literal notranslate"><span class="pre">tito</span> <span class="pre">module</span> <span class="pre">complete</span> <span class="pre">14</span> <span class="pre">&amp;&amp;</span> <span class="pre">tito</span> <span class="pre">test</span> <span class="pre">profiling</span></code></p></li>
</ol>
<p><strong>Development tips</strong>:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Test parameter counting manually first</span>
<span class="n">layer</span> <span class="o">=</span> <span class="n">Linear</span><span class="p">(</span><span class="mi">128</span><span class="p">,</span> <span class="mi">64</span><span class="p">)</span>
<span class="n">expected_params</span> <span class="o">=</span> <span class="p">(</span><span class="mi">128</span> <span class="o">*</span> <span class="mi">64</span><span class="p">)</span> <span class="o">+</span> <span class="mi">64</span> <span class="c1"># weight + bias = 8256</span>
<span class="n">actual_params</span> <span class="o">=</span> <span class="n">profiler</span><span class="o">.</span><span class="n">count_parameters</span><span class="p">(</span><span class="n">layer</span><span class="p">)</span>
<span class="k">assert</span> <span class="n">actual_params</span> <span class="o">==</span> <span class="n">expected_params</span>
<span class="c1"># Verify FLOP calculations with small examples</span>
<span class="n">flops</span> <span class="o">=</span> <span class="n">profiler</span><span class="o">.</span><span class="n">count_flops</span><span class="p">(</span><span class="n">layer</span><span class="p">,</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">128</span><span class="p">))</span>
<span class="n">expected_flops</span> <span class="o">=</span> <span class="mi">2</span> <span class="o">*</span> <span class="mi">128</span> <span class="o">*</span> <span class="mi">64</span> <span class="c1"># matmul FLOPs = 16384</span>
<span class="k">assert</span> <span class="n">flops</span> <span class="o">==</span> <span class="n">expected_flops</span>
<span class="c1"># Check memory profiler returns expected keys</span>
<span class="n">mem</span> <span class="o">=</span> <span class="n">profiler</span><span class="o">.</span><span class="n">measure_memory</span><span class="p">(</span><span class="n">layer</span><span class="p">,</span> <span class="p">(</span><span class="mi">32</span><span class="p">,</span> <span class="mi">128</span><span class="p">))</span>
<span class="k">assert</span> <span class="s1">&#39;parameter_memory_mb&#39;</span> <span class="ow">in</span> <span class="n">mem</span>
<span class="k">assert</span> <span class="s1">&#39;activation_memory_mb&#39;</span> <span class="ow">in</span> <span class="n">mem</span>
<span class="k">assert</span> <span class="s1">&#39;peak_memory_mb&#39;</span> <span class="ow">in</span> <span class="n">mem</span>
<span class="c1"># Validate latency measurement stability</span>
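<span class="n">input_tensor</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">32</span><span class="p">,</span> <span class="mi">128</span><span class="p">))</span>
<span class="n">latencies</span> <span class="o">=</span> <span class="p">[</span><span class="n">profiler</span><span class="o">.</span><span class="n">measure_latency</span><span class="p">(</span><span class="n">layer</span><span class="p">,</span> <span class="n">input_tensor</span><span class="p">)</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">3</span><span class="p">)]</span>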
<span class="n">std_dev</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">std</span><span class="p">(</span><span class="n">latencies</span><span class="p">)</span>
<span class="k">assert</span> <span class="n">std_dev</span> <span class="o">&lt;</span> <span class="n">np</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">latencies</span><span class="p">)</span> <span class="o">*</span> <span class="mf">0.2</span> <span class="c1"># Coefficient of variation &lt; 20%</span>
</pre></div>
</div>
</section>
</section>
<section id="testing">
<h2>Testing<a class="headerlink" href="#testing" title="Link to this heading">#</a></h2>
<section id="comprehensive-test-suite">
<h3>Comprehensive Test Suite<a class="headerlink" href="#comprehensive-test-suite" title="Link to this heading">#</a></h3>
<p>Run the full test suite to verify profiling functionality:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># TinyTorch CLI (recommended)</span>
tito<span class="w"> </span><span class="nb">test</span><span class="w"> </span>profiling
<span class="c1"># Direct pytest execution</span>
python<span class="w"> </span>-m<span class="w"> </span>pytest<span class="w"> </span>tests/<span class="w"> </span>-k<span class="w"> </span>profiling<span class="w"> </span>-v
</pre></div>
</div>
</section>
<section id="test-coverage-areas">
<h3>Test Coverage Areas<a class="headerlink" href="#test-coverage-areas" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Parameter counting accuracy</strong>: Verifies correct counts for Linear, Conv2d, models with/without parameters</p></li>
<li><p><strong>FLOP calculation correctness</strong>: Validates formulas for different layer types (Linear, Conv2d, attention)</p></li>
<li><p><strong>Memory measurement reliability</strong>: Checks tracemalloc integration, memory component tracking</p></li>
<li><p><strong>Latency measurement consistency</strong>: Tests statistical timing with warmup runs and multiple iterations</p></li>
<li><p><strong>Advanced profiling completeness</strong>: Validates forward/backward profiling returns all required metrics</p></li>
</ul>
</section>
<section id="inline-testing-validation">
<h3>Inline Testing &amp; Validation<a class="headerlink" href="#inline-testing-validation" title="Link to this heading">#</a></h3>
<p>The module includes comprehensive unit tests:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># Parameter counting validation</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">Parameter</span> <span class="n">Counting</span><span class="o">...</span>
<span class="err"></span> <span class="n">Simple</span> <span class="n">model</span><span class="p">:</span> <span class="mi">55</span> <span class="n">parameters</span> <span class="p">(</span><span class="mi">10</span><span class="err">×</span><span class="mi">5</span> <span class="n">weight</span> <span class="o">+</span> <span class="mi">5</span> <span class="n">bias</span><span class="p">)</span>
<span class="err"></span> <span class="n">No</span> <span class="n">parameter</span> <span class="n">model</span><span class="p">:</span> <span class="mi">0</span> <span class="n">parameters</span>
<span class="err"></span> <span class="n">Direct</span> <span class="n">tensor</span><span class="p">:</span> <span class="mi">0</span> <span class="n">parameters</span>
<span class="err"></span> <span class="n">Parameter</span> <span class="n">counting</span> <span class="n">works</span> <span class="n">correctly</span><span class="err">!</span>
<span class="c1"># FLOP counting validation</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">FLOP</span> <span class="n">Counting</span><span class="o">...</span>
<span class="err"></span> <span class="n">Tensor</span> <span class="n">operation</span><span class="p">:</span> <span class="mi">32</span> <span class="n">FLOPs</span>
<span class="err"></span> <span class="n">Linear</span> <span class="n">layer</span><span class="p">:</span> <span class="mi">16384</span> <span class="n">FLOPs</span> <span class="p">(</span><span class="mi">128</span> <span class="err">×</span> <span class="mi">64</span> <span class="err">×</span> <span class="mi">2</span><span class="p">)</span>
<span class="err"></span> <span class="n">Batch</span> <span class="n">independence</span><span class="p">:</span> <span class="mi">16384</span> <span class="n">FLOPs</span> <span class="p">(</span><span class="n">same</span> <span class="k">for</span> <span class="n">batch</span> <span class="mi">1</span> <span class="ow">and</span> <span class="mi">32</span><span class="p">)</span>
<span class="err"></span> <span class="n">FLOP</span> <span class="n">counting</span> <span class="n">works</span> <span class="n">correctly</span><span class="err">!</span>
<span class="c1"># Memory measurement validation</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">Memory</span> <span class="n">Measurement</span><span class="o">...</span>
<span class="err"></span> <span class="n">Basic</span> <span class="n">measurement</span><span class="p">:</span> <span class="mf">0.153</span> <span class="n">MB</span> <span class="n">peak</span>
<span class="err"></span> <span class="n">Scaling</span><span class="p">:</span> <span class="n">Small</span> <span class="mf">0.002</span> <span class="n">MB</span> <span class="err"></span> <span class="n">Large</span> <span class="mf">0.020</span> <span class="n">MB</span>
<span class="err"></span> <span class="n">Efficiency</span><span class="p">:</span> <span class="mf">0.524</span> <span class="p">(</span><span class="mi">0</span><span class="o">-</span><span class="mi">1</span> <span class="nb">range</span><span class="p">)</span>
<span class="err"></span> <span class="n">Memory</span> <span class="n">measurement</span> <span class="n">works</span> <span class="n">correctly</span><span class="err">!</span>
<span class="c1"># Latency measurement validation</span>
<span class="err">🔬</span> <span class="n">Unit</span> <span class="n">Test</span><span class="p">:</span> <span class="n">Latency</span> <span class="n">Measurement</span><span class="o">...</span>
<span class="err"></span> <span class="n">Basic</span> <span class="n">latency</span><span class="p">:</span> <span class="mf">0.008</span> <span class="n">ms</span>
<span class="err"></span> <span class="n">Consistency</span><span class="p">:</span> <span class="mf">0.010</span> <span class="err">±</span> <span class="mf">0.002</span> <span class="n">ms</span>
<span class="err"></span> <span class="n">Scaling</span><span class="p">:</span> <span class="n">Small</span> <span class="mf">0.006</span> <span class="n">ms</span><span class="p">,</span> <span class="n">Large</span> <span class="mf">0.012</span> <span class="n">ms</span>
<span class="err"></span> <span class="n">Latency</span> <span class="n">measurement</span> <span class="n">works</span> <span class="n">correctly</span><span class="err">!</span>
</pre></div>
</div>
</section>
<section id="manual-testing-examples">
<h3>Manual Testing Examples<a class="headerlink" href="#manual-testing-examples" title="Link to this heading">#</a></h3>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">profiling_dev</span><span class="w"> </span><span class="kn">import</span> <span class="n">Profiler</span><span class="p">,</span> <span class="n">quick_profile</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.nn.layers</span><span class="w"> </span><span class="kn">import</span> <span class="n">Linear</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tinytorch.core.tensor</span><span class="w"> </span><span class="kn">import</span> <span class="n">Tensor</span>
<span class="c1"># Example 1: Profile a simple layer</span>
<span class="n">layer</span> <span class="o">=</span> <span class="n">Linear</span><span class="p">(</span><span class="mi">256</span><span class="p">,</span> <span class="mi">128</span><span class="p">)</span>
<span class="n">input_tensor</span> <span class="o">=</span> <span class="n">Tensor</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">32</span><span class="p">,</span> <span class="mi">256</span><span class="p">))</span>
<span class="n">profiler</span> <span class="o">=</span> <span class="n">Profiler</span><span class="p">()</span>
<span class="n">profile</span> <span class="o">=</span> <span class="n">profiler</span><span class="o">.</span><span class="n">profile_forward_pass</span><span class="p">(</span><span class="n">layer</span><span class="p">,</span> <span class="n">input_tensor</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Parameters: </span><span class="si">{</span><span class="n">profile</span><span class="p">[</span><span class="s1">&#39;parameters&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">,</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;FLOPs: </span><span class="si">{</span><span class="n">profile</span><span class="p">[</span><span class="s1">&#39;flops&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">,</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Latency: </span><span class="si">{</span><span class="n">profile</span><span class="p">[</span><span class="s1">&#39;latency_ms&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> ms&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Memory: </span><span class="si">{</span><span class="n">profile</span><span class="p">[</span><span class="s1">&#39;peak_memory_mb&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2"> MB&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Bottleneck: </span><span class="si">{</span><span class="n">profile</span><span class="p">[</span><span class="s1">&#39;bottleneck&#39;</span><span class="p">]</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="c1"># Output:</span>
<span class="c1"># Parameters: 32,896</span>
<span class="c1"># FLOPs: 65,536</span>
<span class="c1"># Latency: 0.15 ms</span>
<span class="c1"># Memory: 2.10 MB</span>
<span class="c1"># Bottleneck: memory</span>
<span class="c1"># Example 2: Compare architectures</span>
<span class="n">mlp</span> <span class="o">=</span> <span class="n">Linear</span><span class="p">(</span><span class="mi">512</span><span class="p">,</span> <span class="mi">512</span><span class="p">)</span>
<span class="n">attention</span> <span class="o">=</span> <span class="n">MultiHeadAttention</span><span class="p">(</span><span class="n">d_model</span><span class="o">=</span><span class="mi">512</span><span class="p">,</span> <span class="n">num_heads</span><span class="o">=</span><span class="mi">8</span><span class="p">)</span>
<span class="n">mlp_profile</span> <span class="o">=</span> <span class="n">profiler</span><span class="o">.</span><span class="n">profile_forward_pass</span><span class="p">(</span><span class="n">mlp</span><span class="p">,</span> <span class="n">mlp_input</span><span class="p">)</span>
<span class="n">attention_profile</span> <span class="o">=</span> <span class="n">profiler</span><span class="o">.</span><span class="n">profile_forward_pass</span><span class="p">(</span><span class="n">attention</span><span class="p">,</span> <span class="n">attention_input</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;MLP GFLOP/s: </span><span class="si">{</span><span class="n">mlp_profile</span><span class="p">[</span><span class="s1">&#39;gflops_per_second&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Attention GFLOP/s: </span><span class="si">{</span><span class="n">attention_profile</span><span class="p">[</span><span class="s1">&#39;gflops_per_second&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">.2f</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="c1"># Output reveals which operation is more efficient</span>
<span class="c1"># Example 3: Analyze training memory</span>
<span class="n">training_profile</span> <span class="o">=</span> <span class="n">profiler</span><span class="o">.</span><span class="n">profile_backward_pass</span><span class="p">(</span><span class="n">model</span><span class="p">,</span> <span class="n">input_tensor</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Forward memory: </span><span class="si">{</span><span class="n">training_profile</span><span class="p">[</span><span class="s1">&#39;forward_memory_mb&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">.1f</span><span class="si">}</span><span class="s2"> MB&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Gradient memory: </span><span class="si">{</span><span class="n">training_profile</span><span class="p">[</span><span class="s1">&#39;gradient_memory_mb&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">.1f</span><span class="si">}</span><span class="s2"> MB&quot;</span><span class="p">)</span>
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Total training memory: </span><span class="si">{</span><span class="n">training_profile</span><span class="p">[</span><span class="s1">&#39;total_memory_mb&#39;</span><span class="p">]</span><span class="si">:</span><span class="s2">.1f</span><span class="si">}</span><span class="s2"> MB&quot;</span><span class="p">)</span>
<span class="k">for</span> <span class="n">opt_name</span><span class="p">,</span> <span class="n">opt_memory</span> <span class="ow">in</span> <span class="n">training_profile</span><span class="p">[</span><span class="s1">&#39;optimizer_memory_estimates&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
    <span class="n">total_with_opt</span> <span class="o">=</span> <span class="n">training_profile</span><span class="p">[</span><span class="s1">&#39;total_memory_mb&#39;</span><span class="p">]</span> <span class="o">+</span> <span class="n">opt_memory</span>
    <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">opt_name</span><span class="o">.</span><span class="n">upper</span><span class="p">()</span><span class="si">}</span><span class="s2">: </span><span class="si">{</span><span class="n">total_with_opt</span><span class="si">:</span><span class="s2">.1f</span><span class="si">}</span><span class="s2"> MB total&quot;</span><span class="p">)</span>
<span class="c1"># Output:</span>
<span class="c1"># Forward memory: 2.1 MB</span>
<span class="c1"># Gradient memory: 2.0 MB</span>
<span class="c1"># Total training memory: 4.1 MB</span>
<span class="c1"># SGD: 4.1 MB total</span>
<span class="c1"># ADAM: 8.1 MB total (2× extra for momentum + velocity)</span>
</pre></div>
</div>
</section>
</section>
<section id="systems-thinking-questions">
<h2>Systems Thinking Questions<a class="headerlink" href="#systems-thinking-questions" title="Link to this heading">#</a></h2>
<section id="real-world-applications">
<h3>Real-World Applications<a class="headerlink" href="#real-world-applications" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Google TPU Optimization</strong>: Profile every kernel to achieve 40-50% MFU (Model FLOPs Utilization). Google improved T5 training from 35% to 48% MFU through profiling-guided optimization, saving millions in compute costs at scale across thousands of TPUs. How would you use profiling to identify and fix utilization bottlenecks?</p></li>
<li><p><strong>OpenAI GPT Training</strong>: Profile forward and backward passes separately to measure memory usage across parameters, activations, gradients, and optimizer state. OpenAI identified activation memory as the bottleneck and implemented gradient checkpointing, reducing memory by 10× with only 20% compute overhead while achieving 50%+ MFU. What trade-offs exist between recomputation time and storage memory?</p></li>
<li><p><strong>Meta PyTorch Inference</strong>: Profile operator-by-operator timelines to measure kernel launch overhead and identify operator fusion opportunities. Meta reduced inference latency by 2-3× through operator fusion and optimized p99 latency for billions of daily requests serving Facebook/Instagram recommendations. Why optimize for latency percentiles rather than average?</p></li>
<li><p><strong>NVIDIA cuDNN Development</strong>: Use Nsight profiler to analyze warp occupancy, register pressure, and memory bandwidth utilization to achieve 90%+ of theoretical peak performance. NVIDIA’s profiling data guides both kernel optimization and next-generation hardware design (H100 architecture). How do you distinguish compute-bound from memory-bound kernels?</p></li>
</ul>
</section>
<section id="profiling-foundations">
<h3>Profiling Foundations<a class="headerlink" href="#profiling-foundations" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Amdahl’s Law and ROI</strong>: If attention takes 70% of time and you achieve 2× speedup on attention only, overall speedup is just 1.54× (not 2×) because unoptimized portions limit gains. Why does this mean optimization is iterative—requiring re-profiling after each change to identify new bottlenecks?</p></li>
<li><p><strong>Memory Bandwidth Bottlenecks</strong>: An elementwise ReLU operation on 1B elements achieves only 112 GFLOP/s despite 100 TFLOPS peak compute (0.11% utilization) because it’s memory-bound (8.89 ms to move 8 GB data vs 0.01 ms to compute). What optimization strategies help memory-bound operations vs compute-bound operations?</p></li>
<li><p><strong>Statistical Timing Methodology</strong>: Single measurements include system noise (OS scheduling, thermal throttling, cache effects). Proper profiling uses warmup runs (JIT compilation, cache warming), multiple measurements (100+ iterations), and reports median (robust to outliers) plus p95/p99 percentiles (tail latency). Why does mean latency hide outliers that affect user experience?</p></li>
<li><p><strong>Profiling Overhead Trade-offs</strong>: Instrumentation profiling (15% overhead) provides precise per-operation timing but distorts fast operations, while sampling profiling (2% overhead) enables always-on production monitoring but may miss operations &lt;1 ms. When should you choose instrumentation vs sampling profilers?</p></li>
</ul>
</section>
<section id="performance-characteristics">
<h3>Performance Characteristics<a class="headerlink" href="#performance-characteristics" title="Link to this heading">#</a></h3>
<ul class="simple">
<li><p><strong>Batch Size Scaling</strong>: Throughput doesn’t scale linearly with batch size due to fixed overhead (kernel launch amortizes), memory bandwidth saturation (transfers dominate at large batches), and memory constraints (OOM limits maximum batch size). For a system showing 200→667→914→985 samples/s at batch sizes 1→8→32→64, what’s the optimal batch size for throughput vs efficiency vs latency?</p></li>
<li><p><strong>GPU vs CPU Crossover</strong>: Small matrices (128×128) run faster on CPU despite GPU’s 1000× more cores because GPU overhead (1 ms kernel launch) dominates compute time. Large matrices (4096×4096) achieve 267× GPU speedup because overhead amortizes and parallelism saturates GPU cores. What’s the crossover point and why does PyTorch automatically dispatch based on operation size?</p></li>
<li><p><strong>Parameter vs Activation Memory</strong>: Training memory = Parameters + Activations + Gradients + Optimizer State. For GPT-2 Small (124M params = 496 MB), total training memory is 2.18 GB (4.4× parameter memory) due to activations (200 MB), gradients (496 MB), and Adam state (992 MB = 2× parameters). Which component should you optimize for different memory constraints?</p></li>
<li><p><strong>FLOPs vs Latency</strong>: Theoretical FLOPs predict compute cost hardware-independently, but actual latency depends on memory bandwidth and kernel efficiency. A GPT-2 feedforward layer requires 154 GFLOPs, suggesting 0.5 ms on A100 (312 TFLOPS), but actual time is higher due to memory overhead. Why is profiling real hardware essential despite theoretical calculations?</p></li>
</ul>
</section>
</section>
<section id="ready-to-build">
<h2>Ready to Build?<a class="headerlink" href="#ready-to-build" title="Link to this heading">#</a></h2>
<p>You’re about to implement the profiling tools that enable all subsequent optimization work. These techniques transform research models into production systems by revealing exactly where time and memory go.</p>
<p><strong>What you’ll achieve</strong>:</p>
<ul class="simple">
<li><p>Understand where compute time actually goes in ML models (measure, don’t guess)</p></li>
<li><p>Distinguish memory-bound from compute-bound operations (guides optimization strategy)</p></li>
<li><p>Make data-driven optimization decisions using Amdahl’s Law (maximize ROI on engineering time)</p></li>
<li><p>Build the measurement foundation for Modules 15-20 (optimization techniques)</p></li>
</ul>
<p><strong>The profiling mindset</strong>:</p>
<blockquote>
<div><p>“Measure twice, optimize once. Profile before every optimization decision. Without measurement, you’re flying blind.”
— Every production ML engineer</p>
</div></blockquote>
<p>Choose your preferred way to engage with this module:</p>
<div class="sd-container-fluid sd-sphinx-override sd-mb-4 docutils">
<div class="sd-row sd-row-cols-1 sd-row-cols-xs-1 sd-row-cols-sm-2 sd-row-cols-md-3 sd-row-cols-lg-3 docutils">
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
🚀 Launch Binder</div>
<p class="sd-card-text">Run this module interactively in your browser. No installation required!</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/14_profiling/profiling_dev.ipynb"><span>https://mybinder.org/v2/gh/mlsysbook/TinyTorch/main?filepath=modules/14_profiling/profiling_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
⚡ Open in Colab</div>
<p class="sd-card-text">Use Google Colab for cloud compute power and easy sharing.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/14_profiling/profiling_dev.ipynb"><span>https://colab.research.google.com/github/mlsysbook/TinyTorch/blob/main/modules/14_profiling/profiling_dev.ipynb</span></a></div>
</div>
<div class="sd-col sd-d-flex-row docutils">
<div class="sd-card sd-sphinx-override sd-w-100 sd-shadow-sm sd-card-hover docutils">
<div class="sd-card-body docutils">
<div class="sd-card-title sd-font-weight-bold docutils">
📖 View Source</div>
<p class="sd-card-text">Browse the Python source code and understand the implementation.</p>
</div>
<a class="sd-stretched-link sd-hide-link-text reference external" href="https://github.com/mlsysbook/TinyTorch/blob/main/modules/14_profiling/profiling_dev.py"><span>https://github.com/mlsysbook/TinyTorch/blob/main/modules/14_profiling/profiling_dev.py</span></a></div>
</div>
</div>
</div>
<div class="tip admonition">
<p class="admonition-title">💾 Save Your Progress</p>
<p><strong>Binder sessions are temporary!</strong> Download your completed notebook when done, or switch to local development for persistent work.</p>
</div>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">cd</span><span class="w"> </span>modules/14_profiling
tito<span class="w"> </span>module<span class="w"> </span>start<span class="w"> </span><span class="m">14</span>
python<span class="w"> </span>profiling_dev.py<span class="w"> </span><span class="c1"># Inline tests as you build</span>
</pre></div>
</div>
<hr class="docutils" />
<div class="prev-next-area">
<a class="left-prev" href="13_transformers_ABOUT.html" title="previous page">← Module 13: Transformers</a>
<a class="right-next" href="15_quantization_ABOUT.html" title="next page">Module 15: Quantization →</a>
</div>
</section>
</section>
<script type="text/x-thebe-config">
{
requestKernel: true,
binderOptions: {
repo: "binder-examples/jupyter-stacks-datascience",
ref: "master",
},
codeMirrorConfig: {
theme: "abcdef",
mode: "python"
},
kernelOptions: {
name: "python3",
path: "./modules"
},
predefinedOutput: true
}
</script>
<script>kernelName = 'python3'</script>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
<a class="left-prev"
href="../tiers/optimization.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">⏱️ Optimization Tier (Modules 14-19)</p>
</div>
</a>
<a class="right-next"
href="15_quantization_ABOUT.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">15. Quantization - Reduced Precision for Efficiency</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> Contents
</div>
<nav class="bd-toc-nav page-toc">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#overview">Overview</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#why-this-matters">Why This Matters</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#production-context-profiling-drives-optimization-economics">Production Context: Profiling Drives Optimization Economics</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#historical-evolution-from-ad-hoc-timing-to-systematic-measurement">Historical Evolution: From Ad-Hoc Timing to Systematic Measurement</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#what-youll-actually-build">What You’ll Actually Build</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#learning-objectives">Learning Objectives</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#build-use-reflect">Build → Use → Reflect</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implementation-guide">Implementation Guide</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#core-component-profiler-class">Core Component: Profiler Class</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#parameter-counting-memory-footprint-analysis">Parameter Counting: Memory Footprint Analysis</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#flop-counting-computational-cost-analysis">FLOP Counting: Computational Cost Analysis</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#memory-profiling-understanding-allocation-patterns">Memory Profiling: Understanding Allocation Patterns</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#latency-measurement-statistical-timing-methodology">Latency Measurement: Statistical Timing Methodology</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#getting-started">Getting Started</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#prerequisites">Prerequisites</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#development-workflow">Development Workflow</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#testing">Testing</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#comprehensive-test-suite">Comprehensive Test Suite</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#test-coverage-areas">Test Coverage Areas</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#inline-testing-validation">Inline Testing &amp; Validation</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#manual-testing-examples">Manual Testing Examples</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#systems-thinking-questions">Systems Thinking Questions</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#real-world-applications">Real-World Applications</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#profiling-foundations">Profiling Foundations</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-characteristics">Performance Characteristics</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#ready-to-build">Ready to Build?</a></li>
</ul>
</nav></div>
</div></div>
</div>
<footer class="bd-footer-content">
<div class="bd-footer-content__inner container">
<div class="footer-item">
<p class="component-author">
By Prof. Vijay Janapa Reddi (Harvard University)
</p>
</div>
<div class="footer-item">
<p class="copyright">
© Copyright 2025.
<br/>
</p>
</div>
<div class="footer-item">
</div>
<div class="footer-item">
</div>
</div>
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../_static/scripts/bootstrap.js?digest=dfe6caa3a7d634c4db9b"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=dfe6caa3a7d634c4db9b"></script>
<footer class="bd-footer">
</footer>
</body>
</html>