% File: cs249r_book/mlsysim/tutorial/slides/tutorial_part2.tex
% (Removed non-LaTeX repository-viewer artifacts that preceded \documentclass.)
% =============================================================================
% MLSys·im Tutorial --- Parts 5--9 (Afternoon Session)
% =============================================================================
\documentclass[aspectratio=169, 12pt]{beamer}
\usepackage{../../../slides/assets/beamerthememlsys}
\mlsyssetup{
volume = {Tutorial},
chapter = {Tutorial},
logo = {../../../slides/assets/img/logo-mlsysbook.png},
instlogo = {../../../slides/assets/img/logo-harvard.png},
chaptertitle = {MLSys·im — Afternoon Session},
}
% --- Fonts ---
\usepackage{fontspec}
\setsansfont{Helvetica Neue}[
BoldFont={Helvetica Neue Bold},
ItalicFont={Helvetica Neue Italic},
BoldItalicFont={Helvetica Neue Bold Italic},
]
% Use Courier if JetBrains Mono not available
\IfFontExistsTF{JetBrains Mono}{
\setmonofont{JetBrains Mono}[Scale=0.85]
}{
\setmonofont{Courier New}[Scale=0.90]
}
% --- Packages ---
\usepackage{amsmath}
\usepackage{booktabs}
\usepackage[table]{xcolor}
\usepackage{listings}
\usepackage{tikz}
% --- Code listings ---
\lstset{
language=Python,
basicstyle=\ttfamily\footnotesize,
keywordstyle=\color{crimson}\bfseries,
stringstyle=\color{datastroke},
commentstyle=\color{midgray}\itshape,
backgroundcolor=\color{computeblue!20},
frame=single,
rulecolor=\color{computestroke},
numbers=none,
breaklines=true,
columns=fullflexible,
keepspaces=true,
showstringspaces=false,
xleftmargin=4pt,
xrightmargin=4pt,
aboveskip=6pt,
belowskip=4pt,
}
% --- Convenience macros ---
\newcommand{\mlsysim}{\texttt{mlsysim}}
% --- Image paths ---
\graphicspath{{images/}}
% --- Section count (must match actual \section{} count) ---
\setcounter{mlsystotalsections}{5}
\title{MLSys·im Tutorial --- Afternoon Session}
\subtitle{Parts 5--9: Economics, DSE, TinyML, Advanced, Wrap-up}
\author{Vijay Janapa Reddi}
\institute{Harvard University}
\date{Tutorial}
\begin{document}
% --- Roadmap: After Break ---
\begin{frame}{Roadmap: You Are Here}
\note{[1 min] Quick orientation after break.}
\centering\small
\begin{tabular}{rll}
\toprule
\textbf{Time} & \textbf{Part} & \textbf{Status} \\
\midrule
1:00--2:15 & Part 4: Going Distributed & \checkmark \\
\rowcolor{crimson!12}
2:30--3:15 & \textbf{Part 5: Economics \& Sustainability} & \textbf{$\leftarrow$ You are here} \\
3:15--3:45 & Part 6: Design Space Exploration & \\
3:45--4:15 & Part 7: TinyML to Frontier & \\
4:15--4:45 & Part 8: Advanced Topics & \\
4:45--5:00 & Part 9: Wrap-Up \& Capstone & \\
\bottomrule
\end{tabular}
\end{frame}
% =====================================================================
% PART 5: ECONOMICS & SUSTAINABILITY
% =====================================================================
\section{Economics \& Sustainability}
% --- 5.1 Key Question ---
\begin{frame}{Key Question}
\note{[1 min] Open with the dollar question. Pause dramatically.}
\begin{center}
\Large\bfseries
How much does it \emph{really} cost to train and serve an LLM---\\[4pt]
in dollars, kilowatt-hours, and tonnes of CO$_2$?
\end{center}
\vfill
\begin{quotation}
\small\itshape
``A 1024-GPU H100 cluster costs \$30M in hardware. Electricity is
surprisingly small ($\sim$10\% of total). The dominant cost lever is
utilization---idle GPUs cost the same as busy ones.''
\end{quotation}
\end{frame}
% --- 5.2 The TCO Equation ---
\begin{frame}[fragile]{Wall 17: The Capital Wall (TCO)}
\note{[3 min] Run the TCO code live. Key surprise: GPUs are only 40\% of total cost.}
\begin{columns}[T]
\column{0.55\textwidth}
\begin{block}{The Equation}
\[
\text{TCO} = \underbrace{\text{CapEx}}_{\text{hw + facility}}
+ \underbrace{\text{OpEx}_{\text{energy}}}_{\text{electricity}}
+ \underbrace{\text{OpEx}_{\text{maint}}}_{\text{staff}}
\]
\end{block}
\vspace{0.3em}
\small
\textbf{Key insight:} GPUs are only $\sim$40\% of total CapEx.\\
Networking, cooling, facility, and staff add 50--150\%.
\column{0.42\textwidth}
\begin{lstlisting}
econ = mlsysim.EconomicsModel()
r = econ.solve(
fleet=mlsysim.Systems
.Clusters.Research_256,
duration_days=30,
infrastructure_multiplier=2.5)
print(f"TCO: ${r.tco_usd:,.0f}")
\end{lstlisting}
\end{columns}
\vspace{0.3em}
\small
\begin{itemize}\setlength\itemsep{1pt}
\item \texttt{infrastructure\_multiplier = 2.0--2.5} for full datacenter TCO
\item Amortized over 3--5 year depreciation schedule
\item Cloud spot instances can beat on-prem for bursty workloads
\end{itemize}
\end{frame}
% --- 5.3 Infrastructure Multiplier ---
\begin{frame}{The Infrastructure Multiplier}
\note{[2 min] Walk through the cost breakdown table. Emphasize: budgeting only for GPUs underestimates by 2-3x.}
\centering
\begin{tabular}{lrr}
\toprule
\textbf{Component} & \textbf{Cost (\%)} & \textbf{Multiplier} \\
\midrule
GPU Hardware & 40\% & 1.0$\times$ \\
Network (IB) & 20\% & --- \\
Power \& Cooling & 15\% & --- \\
Facility / Land & 10\% & --- \\
Staff / Operations & 15\% & --- \\
\midrule
\textbf{Total} & 100\% & \textbf{2.5$\times$} \\
\bottomrule
\end{tabular}
\vspace{1em}
\alert{Pitfall:} Budgeting only for GPUs underestimates TCO by 2--3$\times$.
\end{frame}
% --- 5.4 The Sustainability Equation ---
\begin{frame}[fragile]{Wall 18: The Sustainability Wall}
\note{[3 min] Three levers: energy, PUE, carbon intensity. Geography dominates.}
\begin{columns}[T]
\column{0.55\textwidth}
\begin{block}{The Equation}
\small
\[
\text{CO}_2 = \text{Energy} \times \text{PUE} \times \text{CI}
\]
\end{block}
\vspace{0.3em}
\small
\textbf{Three levers:}
\begin{enumerate}\setlength\itemsep{1pt}
\item \textbf{Energy} --- Better MFU, smaller model
\item \textbf{PUE} --- Liquid cooling (1.3 $\to$ 1.06)
\item \textbf{Carbon intensity} --- Choose your grid
\end{enumerate}
\column{0.42\textwidth}
\begin{lstlisting}
sus = mlsysim.SustainabilityModel()
r = sus.solve(
fleet=mlsysim.Systems
.Clusters.Research_256,
datacenter=mlsysim.Infra
.Grids.Quebec,
duration_days=30, mfu=0.45)
kg = r.carbon_footprint_kg
print(f"{kg/1000:.1f} tonnes CO2")
\end{lstlisting}
\end{columns}
\vspace{0.3em}
\begin{center}
\alert{Geography is the single biggest lever for sustainable AI.}
\end{center}
\end{frame}
% --- 5.5 Geography Matters ---
\begin{frame}{Carbon Intensity: The 41$\times$ Gap}
\note{[2 min] Show the table. Poland vs Quebec: 41x difference. Same energy, different grid.}
\begin{columns}[T]
\column{0.45\textwidth}
\centering
\begin{tabular}{lcr}
\toprule
\textbf{Region} & \textbf{Mix} & \textbf{gCO$_2$} \\
\midrule
Quebec & Hydro & 20 \\
Sweden & Hydro+Nuc & 25 \\
US Avg & Mixed & 390 \\
Germany & Coal+Wind & 380 \\
Poland & Coal & 820 \\
\bottomrule
\end{tabular}
\vspace{0.5em}
\small
Training the \emph{same model}:\\
\textbf{Poland:} $\sim$800\,t\quad vs\quad \textbf{Quebec:} $\sim$20\,t\\[4pt]
$\Rightarrow$ \textbf{41$\times$} difference.
\column{0.52\textwidth}
\centering
\includegraphics[width=0.85\textwidth]{images/pdf/carbon-geography.pdf}
\end{columns}
\end{frame}
% --- 5.6 Embodied Carbon ---
\begin{frame}{Embodied Carbon: Manufacturing Dominates at the Edge}
\note{[2 min] Key insight: at cloud scale, operational carbon dominates. At TinyML scale, embodied carbon dominates.}
\begin{columns}[T]
\column{0.5\textwidth}
\textbf{Cloud / Training}
\begin{itemize}
\item Operational carbon dominates
\item GPU runs 24/7 for months
\item Embodied: $\sim$5\% of lifecycle
\end{itemize}
\column{0.5\textwidth}
\textbf{IoT / TinyML}
\begin{itemize}
\item Manufacturing carbon dominates
\item MCU uses microwatts
\item Embodied: $>$90\% of lifecycle
\item Multiplied by millions of devices
\end{itemize}
\end{columns}
\vspace{1em}
\begin{center}
\small
Source: Gupta et al.\ (2022), ``ACT: Designing Sustainable Computer\\
Systems with an Architectural Carbon Modeling Tool''
\end{center}
\end{frame}
% --- 5.7 Live Demo: Economics ---
\begin{frame}[fragile]{Live Demo: TCO + Carbon Analysis}
\note{[3 min] Run live. Show TCO and carbon side by side.}
\begin{lstlisting}
fleet = mlsysim.Systems.Clusters.Research_256
econ = mlsysim.EconomicsModel()
result = econ.solve(
fleet=fleet, duration_days=30,
datacenter=mlsysim.Infra.Grids.Quebec,
mfu=0.45, infrastructure_multiplier=2.5,
amortization_years=3.0)
print(f"TCO: ${result.tco_usd:,.0f}")
print(f"Carbon: {result.carbon_footprint_kg/1000:.1f} t")
\end{lstlisting}
\end{frame}
% --- 5.8 Live Demo: Geography Comparison ---
\begin{frame}[fragile]{Live Demo: The Carbon Geography Experiment}
\note{[2 min] Run live. Same energy, 41x less carbon. Geography wins.}
\begin{lstlisting}
fleet = mlsysim.Systems.Clusters.Research_256
solver = mlsysim.SustainabilityModel()
for region in [mlsysim.Infra.Grids.Poland,
mlsysim.Infra.Grids.Quebec]:
r = solver.solve(fleet=fleet, duration_days=30,
datacenter=region, mfu=0.45)
print(f"{r.region_name:>12}: "
f"{r.carbon_footprint_kg/1000:.1f} t CO2")
\end{lstlisting}
\vspace{0.5em}
\begin{exampleblock}{Expected Output}
\ttfamily\scriptsize
\hspace{1em}Poland: 1,064 MWh, 872.5 tonnes CO2\\
\hspace{1em}Quebec: 1,064 MWh, \phantom{0}21.3 tonnes CO2
\end{exampleblock}
\vspace{0.3em}
\alert{Same energy. 41$\times$ less carbon. Geography wins.}
\end{frame}
% --- 5.9 Exercise Part A: Carbon-Aware Placement ---
\begin{frame}[fragile]{Exercise 5a: Carbon-Aware Placement}
\note{[3 min] Expected answer: $\sim$1{,}700 tonnes CO$_2$ saved
(twice the 256-GPU demo's $\sim$850\,t, since this fleet is 512 GPUs).
The energy is identical ($\sim$2{,}100 MWh)---only carbon intensity differs.
Key insight: the cheapest carbon reduction is a \texttt{git push} to a different region.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.}
\begin{alertblock}{Your Task (3 minutes)}
Compare a 30-day training run on 512 H100s:
\textbf{Poland} vs \textbf{Quebec}. How many tonnes of CO$_2$ saved?
\end{alertblock}
\begin{lstlisting}
fleet = mlsysim.Fleet(
name="Training Fleet",
node=mlsysim.Systems.Nodes.DGX_H100,
count=64,
fabric=mlsysim.Systems.Fabrics.InfiniBand_NDR)
solver = mlsysim.SustainabilityModel()
# YOUR CODE: solve for Poland and Quebec
\end{lstlisting}
\end{frame}
% --- 5.9b Exercise Part B: Multi-Vendor TCO Shootout ---
\begin{frame}[fragile]{Exercise 5b: Multi-Vendor TCO Shootout}
\note{[5 min] THE ISCA QUESTION. Which vendor gives the cheapest training?
Expected: MI300X cluster is cheapest per FLOP (192 GB HBM means fewer
nodes needed for large models), but H100 has the best software ecosystem.
Gaudi\,3 is competitive on raw cost. The answer depends on model size.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.}
\begin{alertblock}{Your Task (5 minutes)}
Same workload: Llama-3 70B, 30-day training run, 512 accelerators.\\
Which cluster is cheapest for the \textbf{same throughput}?
\end{alertblock}
\begin{lstlisting}
econ = mlsysim.EconomicsModel()
for cluster_name in ["H100", "MI300X", "Gaudi3"]:
hw = getattr(mlsysim.Hardware.Cloud, cluster_name)
fleet = mlsysim.Fleet(name=cluster_name, hw=hw,
count=64)
r = econ.solve(fleet=fleet, duration_days=30,
infrastructure_multiplier=2.5)
print(f"{cluster_name:>8}: TCO=${r.tco_usd:,.0f}")
\end{lstlisting}
\small
\alert{The cheapest accelerator depends on the workload's binding constraint.}
\end{frame}
% --- 5.10 Takeaway ---
\begin{frame}{Key Takeaway: Economics \& Sustainability}
\note{[1 min] Three points, repeat each.}
\begin{center}
\Large
\begin{enumerate}
\item \textbf{TCO $\neq$ GPU cost.} Infrastructure is 2--2.5$\times$.
\item \textbf{Utilization is the cost lever.} Idle GPUs cost the same.
\item \textbf{Geography $>$ algorithms} for carbon reduction.
\end{enumerate}
\vspace{1em}
\normalsize
\textit{``The cheapest watt is the one you don't consume.\\
The cleanest watt is the one from a hydro dam.''}
\end{center}
\end{frame}
% =====================================================================
% PART 6: DESIGN SPACE EXPLORATION
% =====================================================================
\section{Design Space Exploration}
% --- 6.1 Key Question ---
\begin{frame}{Key Question}
\note{[1 min] Frame the search problem. The space is combinatorial.}
\begin{center}
\Large\bfseries
Given a budget and an SLA,\\[4pt]
how do you find the \emph{best} hardware configuration?
\end{center}
\vfill
\small
The search space is combinatorial:
$|\text{hardware}| \times |\text{batch sizes}| \times |\text{precisions}| \times |\text{parallelism configs}|$\\
can easily exceed $10^4$ configurations.
\end{frame}
% --- 6.2 The DSE Pattern ---
\begin{frame}[fragile]{The DSE Pattern: Declare, Search, Rank}
\note{[3 min] Three-step pattern. Emphasize: analytical models make exhaustive search feasible.}
\begin{columns}[T]
\column{0.5\textwidth}
\begin{enumerate}
\item \textbf{Declare} the search space:
\begin{itemize}\footnotesize
\item Hardware: \{A100, H100, H200, B200\}
\item Batch sizes: \{1, 2, 4, \ldots, 64\}
\item Precisions: \{fp16, int8, int4\}
\end{itemize}
\item \textbf{Search} with an objective:
\begin{itemize}\footnotesize
\item \texttt{minimize: tco\_usd}
\item \texttt{maximize: throughput}
\end{itemize}
\item \textbf{Rank} subject to constraints:
\begin{itemize}\footnotesize
\item \texttt{latency < 50 ms}
\item \texttt{feasible == True}
\end{itemize}
\end{enumerate}
\column{0.47\textwidth}
\begin{lstlisting}
from mlsysim.core.dse import DSE
dse = DSE(
space={
"batch_size": [1,4,8,16,32],
"precision": ["fp16","int8"],
},
objective="maximize: throughput",
constraints=["latency < 100"])
\end{lstlisting}
\vspace{0.3em}
\small
Analytical models $\Rightarrow$ each evaluation\\
takes $<$1\,ms $\Rightarrow$ exhaustive search.
\end{columns}
\end{frame}
% --- 6.3 Pareto Fronts ---
\begin{frame}{Pareto Fronts: No Free Lunch}
\note{[2 min] Explain the Pareto front. The knee is usually the sweet spot.}
\centering
\includegraphics[width=0.65\textwidth]{images/pdf/pareto-front.pdf}
\vspace{0.5em}
\textbf{Pareto front}: the set of configurations where\\
improving one metric \emph{must} worsen another.
\begin{itemize}
\item Lower latency $\Rightarrow$ lower throughput (smaller batch)
\item Higher throughput $\Rightarrow$ higher cost (more GPUs)
\item The ``knee'' of the curve is usually the sweet spot
\end{itemize}
\end{frame}
% --- 6.4 Live Demo: Engine.sweep ---
\begin{frame}[fragile]{Live Demo: Design Space Sweep}
\note{[3 min] Run live. Note multi-vendor hardware list: H100, MI300X, Gaudi3, B200.}
\begin{lstlisting}
hw_list = [mlsysim.Hardware.Cloud.H100,
mlsysim.Hardware.Cloud.MI300X,
mlsysim.Hardware.Cloud.Gaudi3,
mlsysim.Hardware.Cloud.B200]
results = mlsysim.Engine.sweep(
llama, hw_list, batch_sizes=[1, 4, 16, 64],
precisions=["fp16", "int8"], efficiency=0.5)
for r in sorted(results,
key=lambda x: x['profile'].latency.magnitude):
p = r['profile']
print(f"{r['hardware']:>8} bs={r['batch_size']:<3} "
f"| {p.bottleneck:<8} | MFU={p.mfu:.3f}")
\end{lstlisting}
\end{frame}
% --- 6.5 Live Demo: DSE with Constraints ---
\begin{frame}[fragile]{Live Demo: DSE with Objective \& Constraints}
\note{[3 min] Run live. Show maximize throughput subject to latency constraint.}
\begin{lstlisting}
from mlsysim.core.dse import DSE
dse = DSE(
space={"batch_size": [1,4,8,16,32,64],
"precision": ["fp16", "int8"]},
objective="maximize: throughput",
constraints=["latency < 100"])
def evaluate(params):
p = mlsysim.Engine.solve(llama, hw,
batch_size=params["batch_size"],
precision=params["precision"])
return {"throughput": p.throughput.magnitude,
"latency": p.latency.to("ms").magnitude}
best = dse.search(evaluate)
print(f"Best: {best['best_params']}")
\end{lstlisting}
\end{frame}
% --- 6.6 Batching Optimizer ---
\begin{frame}[fragile]{Live Demo: Batching Optimizer (Pareto Front)}
\note{[3 min] Run live. Show Pareto front of batch size vs latency.}
\begin{lstlisting}
opt = mlsysim.BatchingOptimizer()
result = opt.solve(
model=llama, hardware=hw, seq_len=128,
arrival_rate_qps=10.0,
sla_latency_ms=20000.0, precision="fp16")
print(f"Optimal bs: {result.best_batch_size}")
for pt in result.pareto_front:
print(f" bs={pt['batch_size']:<4} "
f"lat={pt['p99_latency'].m_as('ms'):.0f} ms")
\end{lstlisting}
\end{frame}
% --- 6.7 Exercise ---
\begin{frame}[fragile]{Exercise: Budget-Constrained Design}
\begin{alertblock}{Your Task (5 minutes)}
Given a \textbf{\$1M budget}, find the best hardware + batch size + precision
configuration for serving Llama-3-8B inference under a 50\,ms latency SLA.
\end{alertblock}
\begin{lstlisting}
# Hint: Use Engine.sweep across hardware tiers,
# filter by feasibility and latency SLA,
# then pick the highest-throughput config
# within budget (check unit_cost in Hardware)
\end{lstlisting}
\note{[5 min] Expected answer: H100 at INT8, batch size 16--32 gives the best
throughput/dollar under 50\,ms. A100 at FP16 fails the SLA at high batch sizes.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.}
\end{frame}
% --- 6.8 Takeaway ---
\begin{frame}{Key Takeaway: Design Space Exploration}
\note{[1 min] Three points, repeat each.}
\begin{center}
\Large
\begin{enumerate}
\item \textbf{Analytical models} make exhaustive DSE feasible ($<$1\,s for $10^4$ configs)
\item \textbf{Pareto fronts} reveal the actual tradeoff structure
\item \textbf{Constraints first}: filter infeasible, then optimize
\end{enumerate}
\vspace{1em}
\normalsize
\textit{``You cannot optimize what you cannot model.\\
You cannot model what you cannot measure.''}
\end{center}
\end{frame}
% =====================================================================
% PART 7: TINYML TO FRONTIER
% =====================================================================
\section{TinyML to Frontier}
% --- 7.1 Key Question ---
\begin{frame}{Key Question}
\note{[1 min] ``Same equation, 9 orders of magnitude apart.''}
\begin{center}
\Large\bfseries
Can the same analytical framework model\\[4pt]
a \$2 microcontroller \emph{and} a \$3M GPU rack?
\end{center}
\vfill
\small
The answer is yes---because the \textbf{Roofline model} is universal.\\
Only the numbers change. The physics is the same.
\end{frame}
% --- 7.2 The Nine Orders of Magnitude ---
\begin{frame}{The 9-Order-of-Magnitude Scale Span}
\note{[2 min] Walk through the table. Compute spans $10^{7}\times$, power $10^{4.7}\times$.}
\begin{columns}[T]
\column{0.52\textwidth}
\centering
\includegraphics[width=0.85\textwidth]{images/pdf/hardware-spectrum.pdf}
\column{0.45\textwidth}
\scriptsize
\begin{tabular}{lrr}
\toprule
\textbf{Device} & \textbf{FLOPS} & \textbf{TDP} \\
\midrule
nRF52840 & 64\,M & 15\,mW \\
ESP32-S3 & 500\,M & 400\,mW \\
\rowcolor{gray!15}
H100 SXM & 989\,T & 700\,W \\
\bottomrule
\end{tabular}
\vspace{0.5em}
\begin{tabular}{lr}
\textbf{Compute} & $\sim 10^{7}\times$ \\
\textbf{Mem BW} & $\sim 10^{7}\times$ \\
\textbf{Power} & $\sim 10^{4.7}\times$ \\
\end{tabular}
\end{columns}
\vspace{0.5em}
\centering
\alert{Same Roofline equation. Nine orders of magnitude apart.}
\end{frame}
% --- 7.3 TinyML Memory Hierarchy ---
\begin{frame}{Flash vs SRAM: The TinyML Memory Wall}
\note{[3 min] Key difference: TinyML has Flash (8 MB, 80 MB/s) vs Cloud HBM (80 GB, 3.35 TB/s).}
\begin{columns}[T]
\column{0.5\textwidth}
\textbf{Cloud GPU (H100)}
\begin{itemize}
\item Weights in HBM (80\,GB)
\item Activations in SRAM (50\,MB L2)
\item Bandwidth: 3.35\,TB/s
\item Model $\leq$ 80\,GB $\Rightarrow$ fits
\end{itemize}
\column{0.5\textwidth}
\textbf{TinyML MCU (ESP32-S3)}
\begin{itemize}
\item Weights in \textbf{Flash} (8\,MB)
\item Activations in \textbf{SRAM} (512\,KiB)
\item Flash BW: 80\,MB/s
\item Model $\leq$ 8\,MB $\Rightarrow$ fits
\item Model $\leq$ 512\,KiB $\Rightarrow$ SRAM-only (12$\times$ faster)
\end{itemize}
\end{columns}
\vspace{1em}
\textbf{Key insight:} mlsysim automatically selects the right memory tier:
\begin{enumerate}
\item Model fits in SRAM $\Rightarrow$ use SRAM bandwidth
\item Model in Flash $\Rightarrow$ use Flash bandwidth (bottleneck!)
\item Model exceeds Flash $\Rightarrow$ infeasible
\end{enumerate}
\end{frame}
% --- 7.4 Energy per Inference ---
\begin{frame}{Energy per Inference: $\mu$J to Joules}
\note{[2 min] 6 orders of magnitude in energy. At TinyML scale, battery life is the constraint.}
\centering
\begin{tabular}{lrrl}
\toprule
\textbf{Device} & \textbf{TDP} & \textbf{Energy/Inf} & \textbf{Use Case} \\
\midrule
nRF52840 & 15\,mW & $\sim$50\,$\mu$J & Keyword spotting \\
ESP32-S3 & 400\,mW & $\sim$1\,mJ & Person detection \\
Jetson Orin NX & 25\,W & $\sim$25\,mJ & Object detection \\
\rowcolor{gray!15}
H100 SXM & 700\,W & $\sim$50\,J & LLM inference \\
\bottomrule
\end{tabular}
\vspace{1em}
\begin{itemize}
\item 6 orders of magnitude in energy per inference
\item TinyML: battery life is the primary constraint
\item Duty cycling (sleep 99\% of the time) extends battery to years
\end{itemize}
\end{frame}
% --- 7.5 Live Demo: Hardware Comparison ---
\begin{frame}[fragile]{Live Demo: nRF52840 vs ESP32 vs H100}
\note{[3 min] Run live. nRF52840 is memory-bound (Flash). H100 finishes in microseconds.}
\begin{lstlisting}
tiny_model = mlsysim.Models.Tiny.KeywordSpotting
devices = [mlsysim.Hardware.Tiny.nRF52840,
mlsysim.Hardware.Tiny.ESP32_S3,
mlsysim.Hardware.Cloud.H100]
for d in devices:
p = mlsysim.Engine.solve(tiny_model, d,
precision="int8", efficiency=0.3)
print(f"{d.name:>15}: {p.latency.to('ms'):.2f~P}"
f" | {p.bottleneck:<8}")
\end{lstlisting}
\vspace{0.3em}
\begin{exampleblock}{Observation}
nRF52840 is memory-bound (Flash bottleneck).
H100 finishes in $\mu$s---but wastes 700\,W doing it.
\textbf{Right-sizing hardware matters.}
mlsysim also supports \textbf{Coral Edge TPU}, Jetson Orin, and Inferentia2.
\end{exampleblock}
\end{frame}
% --- 7.6 Same Roofline, Different Physics ---
\begin{frame}[fragile]{Same Roofline, Different Physics}
\note{[2 min] Both are memory-bound at batch size 1. Same equation, same diagnosis.}
\begin{columns}[T]
\column{0.5\textwidth}
\begin{center}
\[
\text{Latency} = \max\!\left(
\frac{\text{FLOPs}}{\text{Peak} \times \eta},\;
\frac{\text{Weights}}{\text{BW}}
\right)
\]
\end{center}
\vspace{0.3em}
\scriptsize
\begin{tabular}{lll}
\toprule
\textbf{Term} & \textbf{Cloud} & \textbf{TinyML} \\
\midrule
Peak & 989\,TF & 500\,MF \\
BW & 3.35\,TB/s & 80\,MB/s \\
Overhead & 0.01\,ms & 1.0\,ms \\
\bottomrule
\end{tabular}
\column{0.47\textwidth}
\begin{lstlisting}
# Same API, different hardware
for d in [mlsysim.Hardware.Cloud.H100,
mlsysim.Hardware.Tiny.ESP32_S3]:
p = mlsysim.Engine.solve(
mlsysim.Models.Tiny
.KeywordSpotting,
d, precision="int8")
print(f"{d.name}: {p.bottleneck}")
\end{lstlisting}
\end{columns}
\vspace{0.3em}
\centering
\small
\textbf{Both are memory-bound at batch size 1.} Same equation, same diagnosis.
\end{frame}
% --- 7.7 Exercise ---
\begin{frame}[fragile]{Exercise: Right-Sizing for IoT}
\begin{alertblock}{Your Task (5 minutes)}
A smart doorbell needs person detection at $<$100\,ms latency on a
coin-cell battery (500\,mAh @ 3V = 1.5\,Wh). Compare ESP32-S3 vs
nRF52840: which device can run inference for a full year at 1 inference/minute?
\end{alertblock}
\begin{lstlisting}
import mlsysim
model = mlsysim.Models.Tiny.PersonDetection
for hw in [mlsysim.Hardware.Tiny.ESP32_S3,
mlsysim.Hardware.Tiny.nRF52840]:
p = mlsysim.Engine.solve(model, hw,
precision="int8", efficiency=0.3)
energy_j = p.energy.to("J").magnitude
inferences_per_year = 60 * 24 * 365
# YOUR CODE: total energy vs 1.5 Wh battery
\end{lstlisting}
\note{[5 min] Expected: nRF52840 at 15\,mW lasts $>$1 year with duty cycling.
ESP32-S3 at 400\,mW drains the battery in weeks.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.}
\end{frame}
% --- 7.8 Takeaway ---
\begin{frame}{Key Takeaway: TinyML to Frontier}
\note{[1 min] Three points. Right-size hardware to the workload.}
\begin{center}
\Large
\begin{enumerate}
\item The Roofline model spans \textbf{9 orders of magnitude}
\item \textbf{Right-size} hardware to the workload, not the other way around
\item At TinyML scale, \textbf{energy and memory} dominate; at cloud scale, \textbf{cost and communication}
\end{enumerate}
\vspace{1em}
\normalsize
\textit{``The best accelerator is the smallest one\\
that meets your latency and accuracy SLA.''}
\end{center}
\end{frame}
% =====================================================================
% PART 8: ADVANCED TOPICS
% =====================================================================
\section{Advanced Topics}
% --- 8.1 Key Question ---
\begin{frame}{Key Question}
\note{[1 min] Single-wall analysis is not enough. You need the full pipeline.}
\begin{center}
\Large\bfseries
How do you compose multiple analytical models\\[4pt]
into an end-to-end system analysis?
\end{center}
\vfill
\small
Single-wall analysis answers ``what is the bottleneck?''\\
\textbf{Pipeline composition} answers ``what is the total cost of the whole stack?''
\end{frame}
% --- 8.2 Pipeline Composition Pattern ---
\begin{frame}[fragile]{The Pipeline Composition Pattern}
\note{[3 min] Show the Pipeline pattern. Each stage feeds the next.}
\begin{lstlisting}
from mlsysim.core.pipeline import Pipeline
from mlsysim.core.solver import (
ScalingModel, DistributedModel, EconomicsModel)
# Chain: Algorithm -> Fleet -> Economics
pipe = Pipeline([
ScalingModel(), # Wall 11: compute budget
DistributedModel(), # Wall 14: comms overhead
EconomicsModel(), # Wall 17: total cost
])
print(pipe.explain()) # shows DAG + gaps
\end{lstlisting}
\vspace{0.5em}
\texttt{explain()} shows:
\begin{itemize}
\item What each stage requires and produces
\item Which walls from the 22-wall taxonomy are covered
\item Where gaps exist (missing inputs)
\end{itemize}
\end{frame}
% --- 8.3 Pipeline Run ---
\begin{frame}[fragile]{Running the Pipeline}
\note{[3 min] Run live. Show all stages and their outputs.}
\begin{lstlisting}
from mlsysim.core.constants import Q_
results = pipe.run(
compute_budget=Q_("1e21 FLOP"),
fleet=mlsysim.Systems.Clusters.Research_256,
duration_days=30,
datacenter=mlsysim.Infra.Grids.Quebec,
mfu=0.45,
infrastructure_multiplier=2.5)
for stage_name, result in results.items():
print(f"\n--- {stage_name} ---")
print(result)
\end{lstlisting}
\vspace{0.5em}
\textbf{Key design principle:} Each resolver's output fields become available
as inputs to subsequent stages. The pipeline is \emph{not} a black box---it
is a transparent analytical tool.
\end{frame}
% --- 8.4 Sensitivity Analysis ---
\begin{frame}[fragile]{Wall 21: Sensitivity Analysis}
\note{[3 min] ``Which knob should I turn next?'' The parameter with the largest partial derivative.}
\begin{lstlisting}
solver = mlsysim.SensitivitySolver()
result = solver.solve(
model=llama, hardware=hw, precision="fp16",
perturbation_pct=10.0, efficiency=0.5)
print(f"Binding: {result.binding_constraint}")
for p, s in result.sensitivities.items():
tag = "<<<" if p == result.binding_constraint else ""
print(f" {p:>20}: {s:+.4f} {tag}")
\end{lstlisting}
\vspace{0.3em}
\small
\textbf{Rule:} Invest in the parameter with the \emph{largest} $|\partial T / \partial p|$.\\
Improving a non-binding parameter yields \textbf{zero} measurable gain.
\end{frame}
% --- 8.5 Inverse Roofline (Synthesis) ---
\begin{frame}[fragile]{Wall 22: Inverse Roofline (SynthesisSolver)}
\note{[3 min] Most powerful move: derive hardware requirements from SLA. ``I need 50ms. What hardware?''}
\begin{lstlisting}
from mlsysim.core.constants import Q_
solver = mlsysim.SynthesisSolver()
result = solver.solve(
model=llama, target_latency=Q_("50 ms"),
precision="fp16", efficiency=0.5)
print(f"Required BW: {result.required_bw:.0f~P}")
print(f"Required FLOPS: {result.required_flops:.1f~P}")
\end{lstlisting}
\vspace{0.5em}
\textbf{Usage:} ``I need 50\,ms TTFT. What hardware do I need?''\\
$\Rightarrow$ Derive specs from SLA, then match to real accelerators.
\end{frame}
% --- 8.6 Fallacies & Pitfalls ---
\begin{frame}{Fallacies \& Pitfalls (Patterson Tradition)}
\note{[2 min] Walk through each fallacy with quantitative rebuttal.}
\begin{block}{Fallacy: ``Cheaper hardware is always more cost-effective''}
Reality: A 2$\times$ cheaper GPU that takes 3$\times$ longer has \emph{higher} TCO.\\
$\text{TCO} = \text{CapEx} + \text{OpEx}$; slower hardware burns more electricity.
\end{block}
\begin{block}{Fallacy: ``More FLOPS = proportionally faster''}
Reality: A100 $\to$ H100 is 6.3$\times$ peak FLOPS but only $\sim$2$\times$ for
memory-bound LLM inference. The binding constraint is bandwidth, not compute.
\end{block}
\begin{block}{Pitfall: Optimizing a non-binding parameter}
If inference is memory-bound, doubling FLOPS gives 0\% speedup.\\
Use \texttt{SensitivitySolver} to find the binding constraint \emph{first}.
\end{block}
\begin{block}{Pitfall: Ignoring the infrastructure multiplier}
GPU cost is 40\% of total datacenter TCO. Budgeting for GPUs alone\\
underestimates the true cost by 2--3$\times$.
\end{block}
\end{frame}
% --- 8.7 The Iron Law Revisited ---
\begin{frame}{The Iron Law Revisited: All Five Terms}
\note{[2 min] Return to the master equation. Every wall maps to one of these five terms.}
\begin{center}
\Large
\[
T = \frac{\text{FLOPs}}{N \times \text{Peak} \times \text{MFU} \times \eta_{\text{scale}} \times \text{Goodput}}
\]
\end{center}
\vspace{0.5em}
\centering
\begin{tabular}{lll}
\toprule
\textbf{Term} & \textbf{What reduces it} & \textbf{Wall} \\
\midrule
$N$ & Budget (buy more GPUs) & Wall 17 \\
Peak & GPU generation (H100$\to$B200)& Wall 1 \\
MFU & FlashAttention, kernel fusion & Wall 3 \\
$\eta_{\text{scale}}$& Network BW, gradient compression & Wall 14 \\
Goodput & Checkpointing, fault tolerance & Walls 15, 19 \\
\bottomrule
\end{tabular}
\vspace{0.5em}
\textbf{Every wall in the 22-wall taxonomy maps to one of these five terms.}\\
That is the entire framework.
\end{frame}
% --- 8.8 Exercise ---
\begin{frame}[fragile]{Exercise: Binding Constraint Analysis}
\note{[5 min] At bs=1, memory bandwidth is binding. At bs=64, peak FLOPS. This IS the Roofline transition.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.}
\begin{alertblock}{Your Task (5 minutes)}
Use \texttt{SensitivitySolver} on Llama-3-8B + H100.
Which parameter is binding? Now switch to \texttt{batch\_size=64} and
re-run. Does the binding constraint change?
\end{alertblock}
\begin{lstlisting}
solver = mlsysim.SensitivitySolver()
# Batch size 1
r1 = solver.solve(llama, hw, efficiency=0.5)
print(f"bs=1: binding = {r1.binding_constraint}")
# Batch size 64
p64 = mlsysim.Engine.solve(llama, hw, batch_size=64)
print(f"bs=64: bottleneck = {p64.bottleneck}")
\end{lstlisting}
\end{frame}
% =====================================================================
% PART 9: WRAP-UP & FUTURE
% =====================================================================
\section{Wrap-up \& Future}
% --- 9.1 The 22 Walls in One Slide ---
\begin{frame}{The 22 Walls of ML Systems}
\note{[2 min] Full recap. Every wall maps to one Iron Law term.}
\centering
\small
\begin{tabular}{clcl}
\toprule
\textbf{\#} & \textbf{Wall} & \textbf{\#} & \textbf{Wall} \\
\midrule
1 & Compute & 12 & Reasoning (Emerging) \\
2 & Memory & 13 & Fidelity (Compression) \\
3 & Software (MFU) & 14 & Communication \\
4 & Serving & 15 & Fragility (Reliability) \\
5 & Batching (KV) & 16 & Multi-tenant (Queueing) \\
6 & Streaming & 17 & Capital (TCO) \\
7 & Tail Latency & 18 & Sustainability \\
8 & Ingestion & 19 & Checkpoint \\
9 & Transformation & 20 & Safety (Privacy) \\
10 & Locality & 21 & Sensitivity \\
11 & Complexity & 22 & Synthesis (Inverse) \\
\bottomrule
\end{tabular}
\vspace{0.5em}
\textbf{6 Domains:} Node $\cdot$ Data $\cdot$ Algorithm $\cdot$ Fleet $\cdot$ Operations $\cdot$ Meta-Analysis
\vspace{0.3em}
\alert{Every wall maps to one term in the Iron Law.}
\end{frame}
% --- 9.2 What This Tool Does NOT Model ---
\begin{frame}{What \texttt{mlsysim} Does \emph{Not} Model}
\note{[2 min] Honest about limitations. Not a simulator, not a profiler. A reasoning framework.}
\begin{columns}[T]
\column{0.5\textwidth}
\textbf{Not modeled (v0.1.0):}
\begin{itemize}
\item Cache effects / tiling / fusion
\item Real network congestion / jitter
\item CUDA kernel scheduling
\item Multi-model co-location
\item Compiler optimizations
\end{itemize}
\column{0.5\textwidth}
\textbf{By design:}
\begin{itemize}
\item Not a cycle-accurate simulator
\item Not a profiler replacement
\item Not a deployment tool
\item \textbf{Is:} a first-principles\\
analytical reasoning tool
\end{itemize}
\vspace{0.5em}
\textbf{Use it for:}
\begin{itemize}
\item ``Which constraint is binding?''
\item ``Is this hardware sufficient?''
\item ``What should I try first?''
\end{itemize}
\end{columns}
\vspace{0.5em}
\centering
\small\itshape
``All models are wrong, but some are useful.'' --- George Box
\end{frame}
% --- 9.3 v0.2.0 Roadmap ---
\begin{frame}{v0.2.0 Roadmap}
\note{[2 min] Quick overview. Contributions welcome.}
\small
\begin{columns}[T]
\begin{column}{0.48\textwidth}
\textbf{Models \& Hardware}
\begin{enumerate}
\item \textbf{MoE support} --- Expert routing + parallelism
\item \textbf{GQA / MQA} --- Grouped-query KV cache
\item \textbf{Network congestion} --- Contention-aware collectives
\end{enumerate}
\end{column}
\begin{column}{0.48\textwidth}
\textbf{Tooling \& Integration}
\begin{enumerate}\setcounter{enumi}{3}
\item \textbf{Spot pricing} --- Cloud cost optimization
\item \textbf{Marimo dashboards} --- Live exploration
\item \textbf{HuggingFace Hub} --- Auto-import models
\end{enumerate}
\end{column}
\end{columns}
\vspace{0.5em}
\begin{center}
Contributions welcome: \texttt{github.com/harvard-edge/mlsysim}
\end{center}
\end{frame}
% --- 9.4 Design Challenge (Capstone) ---
% Capstone exercise slide: problem statement (alertblock), required design
% decisions (left column), and starter code (right column).
% [fragile] is required because the frame contains a verbatim-like
% lstlisting environment.
\begin{frame}[fragile]{Design Challenge: Capstone Exercise}
\begin{alertblock}{The Problem}
\textbf{\$5M budget.} Serve Llama-3 70B at \textbf{1{,}000 QPS} with
\textbf{$<$100\,ms TTFT} in \textbf{two regions} (US-East + EU-West).
Design the fleet.
\end{alertblock}
\vspace{0.3em}
\begin{columns}[T]
\column{0.48\textwidth}
\textbf{You must specify:}
\begin{enumerate}\footnotesize
\item Hardware choice (which GPU? how many?)
\item Parallelism strategy (TP $\times$ PP)
\item Precision (FP16? INT8? FP8?)
\item Geographic placement (carbon?)
\item Redundancy (replicas per region?)
\end{enumerate}
\column{0.49\textwidth}
% NOTE: listings prints its body verbatim -- do not add LaTeX comments
% inside the lstlisting block (they would appear on the slide).
\begin{lstlisting}
import mlsysim
# Capstone starter code
model = mlsysim.Models.Language.Llama3_70B
hw = mlsysim.Hardware.H100
# Step 1: Can it fit?
p = mlsysim.Engine.solve(
model, hw, precision="fp16")
print(f"Feasible: {p.feasible}")
print(f"Latency: {p.latency:~P}")
# Step 2: TCO for your fleet
econ = mlsysim.EconomicsModel()
# YOUR CODE: design the fleet
\end{lstlisting}
\end{columns}
% Speaker note: key sizing facts the instructor should surface during the
% exercise (memory footprint, replica count, budget constraint).
\note{[35 min] This is intentionally under-specified. Students must make and defend
assumptions. Key insight: 70B at FP16 = 140\,GB, needs tensor parallelism
across 2+ H100s per replica. At 1000 QPS with 100\,ms TTFT, you need many
replicas. Budget constrains how many.}
\end{frame}
% --- 9.4b: Personal Transfer Moment ---
% Reflection slide: students apply the "22 walls" framework to a system
% from their own work. Silent writing, then a few volunteers share.
\begin{frame}{Your Turn: Name That Wall}
\note{[3 min] THE TRANSFER MOMENT. Silent writing. Then 2-3 volunteers share.}
\centering
\Large
\textbf{Think of one ML system in your own work.}\\[1cm]
\normalsize
Write down:\\[0.3cm]
\begin{enumerate}
\item What is the system? (model + hardware + use case)
\item Which of the 22 walls is the \textbf{binding constraint}?
\item What would you change to move the wall?
\end{enumerate}
\vfill
\small\textcolor{gray}{2 minutes. Then we will hear from 3 volunteers.}
\end{frame}
% --- 9.5 Resources & Q&A ---
% Closing slide: getting-started pointers and textbook info (left column),
% key papers referenced throughout the tutorial (right column).
\begin{frame}{Resources \& Next Steps}
\note{[3 min] Point to: mlsysim GitHub, textbook, cheatsheet. ``Take a photo of the cheatsheet.''}
\begin{columns}[T]
\column{0.55\textwidth}
\textbf{Get Started}
\begin{itemize}
\item \texttt{pip install mlsysim}
\item GitHub: \texttt{harvard-edge/mlsysim}
\item 14 examples + 3 Marimo notebooks
\item Full docs: \texttt{mlsysim.readthedocs.io}
\end{itemize}
\vspace{0.5em}
\textbf{The Textbook}
\begin{itemize}
\item \emph{Machine Learning Systems}
\item Volume I: Foundations (single node)
\item Volume II: Systems at Scale (fleet)
\item \texttt{mlsysbook.ai}
\end{itemize}
\column{0.40\textwidth}
\textbf{Key Papers}
% One paper per concept covered in the tutorial (roofline, MFU, scaling
% laws, carbon accounting, KV-cache paging).
\begin{itemize}
\item Williams et al.\ (2009)\\
{\footnotesize Roofline Model}
\item Chowdhery et al.\ (2022)\\
{\footnotesize PaLM / MFU}
\item Hoffmann et al.\ (2022)\\
{\footnotesize Chinchilla Scaling}
\item Patterson et al.\ (2021)\\
{\footnotesize Carbon \& Training}
\item Kwon et al.\ (2023)\\
{\footnotesize PagedAttention}
\end{itemize}
\end{columns}
\vspace{1em}
\begin{center}
\Large\bfseries Thank you! Questions?
\end{center}
\end{frame}
\end{document}