% mirror of https://github.com/harvard-edge/cs249r_book.git
% synced 2026-05-06 17:49:07 -05:00
% (1183 lines, 37 KiB, TeX)
% =============================================================================
% MLSys·im Tutorial — Parts 5–9 (Afternoon Session)
% =============================================================================
% NOTE: beamer preloads xcolor, so the [table] option must be passed as a
% class option (a later \usepackage[table]{xcolor} would raise an option clash).
\documentclass[aspectratio=169, 12pt, xcolor={table}]{beamer}
\usepackage{../../../slides/assets/beamerthememlsys}

\mlsyssetup{
  volume = {Tutorial},
  chapter = {Tutorial},
  logo = {../../../slides/assets/img/logo-mlsysbook.png},
  instlogo = {../../../slides/assets/img/logo-harvard.png},
  chaptertitle = {MLSys·im — Afternoon Session},
}

% --- Fonts ---
\usepackage{fontspec}
\setsansfont{Helvetica Neue}[
  BoldFont={Helvetica Neue Bold},
  ItalicFont={Helvetica Neue Italic},
  BoldItalicFont={Helvetica Neue Bold Italic},
]
% Use Courier if JetBrains Mono not available
\IfFontExistsTF{JetBrains Mono}{
  \setmonofont{JetBrains Mono}[Scale=0.85]
}{
  \setmonofont{Courier New}[Scale=0.90]
}

% --- Packages ---
\usepackage{amsmath}
\usepackage{booktabs}
\usepackage{xcolor}
\usepackage{listings}
\usepackage{tikz}

% --- Code listings ---
\lstset{
  language=Python,
  basicstyle=\ttfamily\footnotesize,
  keywordstyle=\color{crimson}\bfseries,
  stringstyle=\color{datastroke},
  commentstyle=\color{midgray}\itshape,
  backgroundcolor=\color{computeblue!20},
  frame=single,
  rulecolor=\color{computestroke},
  numbers=none,
  breaklines=true,
  columns=fullflexible,
  keepspaces=true,
  showstringspaces=false,
  xleftmargin=4pt,
  xrightmargin=4pt,
  aboveskip=6pt,
  belowskip=4pt,
}

% --- Convenience macros ---
\newcommand{\mlsysim}{\texttt{mlsysim}}

% --- Image paths ---
\graphicspath{{images/}}

% --- Section count (must match actual \section{} count) ---
\setcounter{mlsystotalsections}{5}

\title{MLSys·im Tutorial --- Afternoon Session}
\subtitle{Parts 5--9: Economics, DSE, TinyML, Advanced, Wrap-up}
\author{Vijay Janapa Reddi}
\institute{Harvard University}
\date{Tutorial}

\begin{document}

% --- Roadmap: After Break ---
\begin{frame}{Roadmap: You Are Here}
\note{[1 min] Quick orientation after break.}

\centering\small
\begin{tabular}{rll}
\toprule
\textbf{Time} & \textbf{Part} & \textbf{Status} \\
\midrule
1:00--2:15 & Part 4: Going Distributed & \checkmark \\
\rowcolor{crimson!12}
2:30--3:15 & \textbf{Part 5: Economics \& Sustainability} & \textbf{$\leftarrow$ You are here} \\
3:15--3:45 & Part 6: Design Space Exploration & \\
3:45--4:15 & Part 7: TinyML to Frontier & \\
4:15--4:45 & Part 8: Advanced Topics & \\
4:45--5:00 & Part 9: Wrap-Up \& Capstone & \\
\bottomrule
\end{tabular}
\end{frame}

% =====================================================================
% PART 5: ECONOMICS & SUSTAINABILITY
% =====================================================================
\section{Economics \& Sustainability}

% --- 5.1 Key Question ---
\begin{frame}{Key Question}
\note{[1 min] Open with the dollar question. Pause dramatically.}
\begin{center}
\Large\bfseries
How much does it \emph{really} cost to train and serve an LLM---\\[4pt]
in dollars, kilowatt-hours, and tonnes of CO$_2$?
\end{center}
\vfill
\begin{quotation}
\small\itshape
``A 1024-GPU H100 cluster costs \$30M in hardware. Electricity is
surprisingly small ($\sim$10\% of total). The dominant cost lever is
utilization---idle GPUs cost the same as busy ones.''
\end{quotation}
\end{frame}

% --- 5.2 The TCO Equation ---
\begin{frame}[fragile]{Wall 17: The Capital Wall (TCO)}
\note{[3 min] Run the TCO code live. Key surprise: GPUs are only 40\% of total cost.}
\begin{columns}[T]
\column{0.55\textwidth}
\begin{block}{The Equation}
\[
\text{TCO} = \underbrace{\text{CapEx}}_{\text{hw + facility}}
+ \underbrace{\text{OpEx}_{\text{energy}}}_{\text{electricity}}
+ \underbrace{\text{OpEx}_{\text{maint}}}_{\text{staff}}
\]
\end{block}
\vspace{0.3em}
\small
\textbf{Key insight:} GPUs are only $\sim$40\% of total CapEx.\\
Networking, cooling, facility, and staff add 50--150\%.

\column{0.42\textwidth}
\begin{lstlisting}
econ = mlsysim.EconomicsModel()
r = econ.solve(
    fleet=mlsysim.Systems
        .Clusters.Research_256,
    duration_days=30,
    infrastructure_multiplier=2.5)
print(f"TCO: ${r.tco_usd:,.0f}")
\end{lstlisting}
\end{columns}

\vspace{0.3em}
\small
\begin{itemize}\setlength\itemsep{1pt}
\item \texttt{infrastructure\_multiplier = 2.0--2.5} for full datacenter TCO
\item Amortized over 3--5 year depreciation schedule
\item Cloud spot instances can beat on-prem for bursty workloads
\end{itemize}
\end{frame}

% --- 5.3 Infrastructure Multiplier ---
\begin{frame}{The Infrastructure Multiplier}
\note{[2 min] Walk through the cost breakdown table. Emphasize: budgeting only for GPUs underestimates by 2-3x.}
\centering
\begin{tabular}{lrr}
\toprule
\textbf{Component} & \textbf{Cost (\%)} & \textbf{Multiplier} \\
\midrule
GPU Hardware & 40\% & 1.0$\times$ \\
Network (IB) & 20\% & --- \\
Power \& Cooling & 15\% & --- \\
Facility / Land & 10\% & --- \\
Staff / Operations & 15\% & --- \\
\midrule
\textbf{Total} & 100\% & \textbf{2.5$\times$} \\
\bottomrule
\end{tabular}

\vspace{1em}
\alert{Pitfall:} Budgeting only for GPUs underestimates TCO by 2--3$\times$.
\end{frame}

% --- 5.4 The Sustainability Equation ---
\begin{frame}[fragile]{Wall 18: The Sustainability Wall}
\note{[3 min] Three levers: energy, PUE, carbon intensity. Geography dominates.}
\begin{columns}[T]
\column{0.55\textwidth}
\begin{block}{The Equation}
\small
\[
\text{CO}_2 = \text{Energy} \times \text{PUE} \times \text{CI}
\]
\end{block}
\vspace{0.3em}
\small
\textbf{Three levers:}
\begin{enumerate}\setlength\itemsep{1pt}
\item \textbf{Energy} --- Better MFU, smaller model
\item \textbf{PUE} --- Liquid cooling (1.3 $\to$ 1.06)
\item \textbf{Carbon intensity} --- Choose your grid
\end{enumerate}

\column{0.42\textwidth}
\begin{lstlisting}
sus = mlsysim.SustainabilityModel()
r = sus.solve(
    fleet=mlsysim.Systems
        .Clusters.Research_256,
    datacenter=mlsysim.Infra
        .Grids.Quebec,
    duration_days=30, mfu=0.45)
print(f"{r.carbon_footprint_kg
         /1000:.1f} tonnes CO2")
\end{lstlisting}
\end{columns}

\vspace{0.3em}
\begin{center}
\alert{Geography is the single biggest lever for sustainable AI.}
\end{center}
\end{frame}

% --- 5.5 Geography Matters ---
\begin{frame}{Carbon Intensity: The 41$\times$ Gap}
\note{[2 min] Show the table. Poland vs Quebec: 41x difference. Same energy, different grid.}
\begin{columns}[T]
\column{0.45\textwidth}
\centering
\begin{tabular}{lcr}
\toprule
\textbf{Region} & \textbf{Mix} & \textbf{gCO$_2$} \\
\midrule
Quebec & Hydro & 20 \\
Sweden & Hydro+Nuc & 25 \\
US Avg & Mixed & 390 \\
Germany & Coal+Wind & 380 \\
Poland & Coal & 820 \\
\bottomrule
\end{tabular}

\vspace{0.5em}
\small
Training the \emph{same model}:\\
\textbf{Poland:} $\sim$800\,t\quad vs\quad \textbf{Quebec:} $\sim$20\,t\\[4pt]
$\Rightarrow$ \textbf{41$\times$} difference.

\column{0.52\textwidth}
\centering
\includegraphics[width=0.85\textwidth]{images/pdf/carbon-geography.pdf}
\end{columns}
\end{frame}

% --- 5.6 Embodied Carbon ---
\begin{frame}{Embodied Carbon: Manufacturing Dominates at the Edge}
\note{[2 min] Key insight: at cloud scale, operational carbon dominates. At TinyML scale, embodied carbon dominates.}
\begin{columns}[T]
\column{0.5\textwidth}
\textbf{Cloud / Training}
\begin{itemize}
\item Operational carbon dominates
\item GPU runs 24/7 for months
\item Embodied: $\sim$5\% of lifecycle
\end{itemize}

\column{0.5\textwidth}
\textbf{IoT / TinyML}
\begin{itemize}
\item Manufacturing carbon dominates
\item MCU uses microwatts
\item Embodied: $>$90\% of lifecycle
\item Multiplied by millions of devices
\end{itemize}
\end{columns}

\vspace{1em}
\begin{center}
\small
Source: Gupta et al.\ (2022), ``ACT: Designing Sustainable Computer\\
Systems with an Architectural Carbon Modeling Tool''
\end{center}
\end{frame}

% --- 5.7 Live Demo: Economics ---
\begin{frame}[fragile]{Live Demo: TCO + Carbon Analysis}
\note{[3 min] Run live. Show TCO and carbon side by side.}
\begin{lstlisting}
fleet = mlsysim.Systems.Clusters.Research_256
econ = mlsysim.EconomicsModel()
result = econ.solve(
    fleet=fleet, duration_days=30,
    datacenter=mlsysim.Infra.Grids.Quebec,
    mfu=0.45, infrastructure_multiplier=2.5,
    amortization_years=3.0)
print(f"TCO: ${result.tco_usd:,.0f}")
print(f"Carbon: {result.carbon_footprint_kg/1000:.1f} t")
\end{lstlisting}
\end{frame}

% --- 5.8 Live Demo: Geography Comparison ---
\begin{frame}[fragile]{Live Demo: The Carbon Geography Experiment}
\note{[2 min] Run live. Same energy, 41x less carbon. Geography wins.}
\begin{lstlisting}
fleet = mlsysim.Systems.Clusters.Research_256
solver = mlsysim.SustainabilityModel()
for region in [mlsysim.Infra.Grids.Poland,
               mlsysim.Infra.Grids.Quebec]:
    r = solver.solve(fleet=fleet, duration_days=30,
                     datacenter=region, mfu=0.45)
    print(f"{r.region_name:>12}: "
          f"{r.carbon_footprint_kg/1000:.1f} t CO2")
\end{lstlisting}

\vspace{0.5em}
\begin{exampleblock}{Expected Output}
\ttfamily\scriptsize
\hspace{1em}Poland: 1,064 MWh, 872.5 tonnes CO2\\
\hspace{1em}Quebec: 1,064 MWh, \phantom{0}21.3 tonnes CO2
\end{exampleblock}

\vspace{0.3em}
\alert{Same energy. 41$\times$ less carbon. Geography wins.}
\end{frame}

% --- 5.9 Exercise Part A: Carbon-Aware Placement ---
\begin{frame}[fragile]{Exercise 5a: Carbon-Aware Placement}
\note{[3 min] Expected answer: $\sim$850 tonnes CO$_2$ saved.
The energy is identical ($\sim$2{,}100 MWh)---only carbon intensity differs.
Key insight: the cheapest carbon reduction is a \texttt{git push} to a different region.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.}

\begin{alertblock}{Your Task (3 minutes)}
Compare a 30-day training run on 512 H100s:
\textbf{Poland} vs \textbf{Quebec}. How many tonnes of CO$_2$ saved?
\end{alertblock}

\begin{lstlisting}
fleet = mlsysim.Fleet(
    name="Training Fleet",
    node=mlsysim.Systems.Nodes.DGX_H100,
    count=64,
    fabric=mlsysim.Systems.Fabrics.InfiniBand_NDR)
solver = mlsysim.SustainabilityModel()
# YOUR CODE: solve for Poland and Quebec
\end{lstlisting}
\end{frame}

% --- 5.9b Exercise Part B: Multi-Vendor TCO Shootout ---
\begin{frame}[fragile]{Exercise 5b: Multi-Vendor TCO Shootout}
\note{[5 min] THE ISCA QUESTION. Which vendor gives the cheapest training?
Expected: MI300X cluster is cheapest per FLOP (192 GB HBM means fewer
nodes needed for large models), but H100 has the best software ecosystem.
Gaudi\,3 is competitive on raw cost. The answer depends on model size.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.}

\begin{alertblock}{Your Task (5 minutes)}
Same workload: Llama-3 70B, 30-day training run, 512 accelerators.\\
Which cluster is cheapest for the \textbf{same throughput}?
\end{alertblock}

\begin{lstlisting}
econ = mlsysim.EconomicsModel()
for cluster_name in ["H100", "MI300X", "Gaudi3"]:
    hw = getattr(mlsysim.Hardware.Cloud, cluster_name)
    fleet = mlsysim.Fleet(name=cluster_name, hw=hw,
                          count=64)
    r = econ.solve(fleet=fleet, duration_days=30,
                   infrastructure_multiplier=2.5)
    print(f"{cluster_name:>8}: TCO=${r.tco_usd:,.0f}")
\end{lstlisting}

\small
\alert{The cheapest accelerator depends on the workload's binding constraint.}
\end{frame}

% --- 5.10 Takeaway ---
\begin{frame}{Key Takeaway: Economics \& Sustainability}
\note{[1 min] Three points, repeat each.}
\begin{center}
\Large
\begin{enumerate}
\item \textbf{TCO $\neq$ GPU cost.} Infrastructure is 2--2.5$\times$.
\item \textbf{Utilization is the cost lever.} Idle GPUs cost the same.
\item \textbf{Geography $>$ algorithms} for carbon reduction.
\end{enumerate}

\vspace{1em}
\normalsize
\textit{``The cheapest watt is the one you don't consume.\\
The cleanest watt is the one from a hydro dam.''}
\end{center}
\end{frame}


% =====================================================================
% PART 6: DESIGN SPACE EXPLORATION
% =====================================================================
\section{Design Space Exploration}

% --- 6.1 Key Question ---
\begin{frame}{Key Question}
\note{[1 min] Frame the search problem. The space is combinatorial.}
\begin{center}
\Large\bfseries
Given a budget and an SLA,\\[4pt]
how do you find the \emph{best} hardware configuration?
\end{center}
\vfill
\small
The search space is combinatorial:
$|\text{hardware}| \times |\text{batch sizes}| \times |\text{precisions}| \times |\text{parallelism configs}|$\\
can easily exceed $10^4$ configurations.
\end{frame}

% --- 6.2 The DSE Pattern ---
\begin{frame}[fragile]{The DSE Pattern: Declare, Search, Rank}
\note{[3 min] Three-step pattern. Emphasize: analytical models make exhaustive search feasible.}
\begin{columns}[T]
\column{0.5\textwidth}
\begin{enumerate}
\item \textbf{Declare} the search space:
\begin{itemize}\footnotesize
\item Hardware: \{A100, H100, H200, B200\}
\item Batch sizes: \{1, 2, 4, \ldots, 64\}
\item Precisions: \{fp16, int8, int4\}
\end{itemize}
\item \textbf{Search} with an objective:
\begin{itemize}\footnotesize
\item \texttt{minimize: tco\_usd}
\item \texttt{maximize: throughput}
\end{itemize}
\item \textbf{Rank} subject to constraints:
\begin{itemize}\footnotesize
\item \texttt{latency < 50 ms}
\item \texttt{feasible == True}
\end{itemize}
\end{enumerate}

\column{0.47\textwidth}
\begin{lstlisting}
from mlsysim.core.dse import DSE

dse = DSE(
    space={
        "batch_size": [1,4,8,16,32],
        "precision": ["fp16","int8"],
    },
    objective="maximize: throughput",
    constraints=["latency < 100"])
\end{lstlisting}
\vspace{0.3em}
\small
Analytical models $\Rightarrow$ each evaluation\\
takes $<$1\,ms $\Rightarrow$ exhaustive search.
\end{columns}
\end{frame}

% --- 6.3 Pareto Fronts ---
\begin{frame}{Pareto Fronts: No Free Lunch}
\note{[2 min] Explain the Pareto front. The knee is usually the sweet spot.}
\centering
\includegraphicsworth[width=0.65\textwidth]{images/pdf/pareto-front.pdf}

\vspace{0.5em}
\textbf{Pareto front}: the set of configurations where\\
improving one metric \emph{must} worsen another.

\begin{itemize}
\item Lower latency $\Rightarrow$ lower throughput (smaller batch)
\item Higher throughput $\Rightarrow$ higher cost (more GPUs)
\item The ``knee'' of the curve is usually the sweet spot
\end{itemize}
\end{frame}

% --- 6.4 Live Demo: Engine.sweep ---
\begin{frame}[fragile]{Live Demo: Design Space Sweep}
\note{[3 min] Run live. Note multi-vendor hardware list: H100, MI300X, Gaudi3, B200.}
\begin{lstlisting}
hw_list = [mlsysim.Hardware.Cloud.H100,
           mlsysim.Hardware.Cloud.MI300X,
           mlsysim.Hardware.Cloud.Gaudi3,
           mlsysim.Hardware.Cloud.B200]
results = mlsysim.Engine.sweep(
    llama, hw_list, batch_sizes=[1, 4, 16, 64],
    precisions=["fp16", "int8"], efficiency=0.5)
for r in sorted(results,
        key=lambda x: x['profile'].latency.magnitude):
    p = r['profile']
    print(f"{r['hardware']:>8} bs={r['batch_size']:<3} "
          f"| {p.bottleneck:<8} | MFU={p.mfu:.3f}")
\end{lstlisting}
\end{frame}

% --- 6.5 Live Demo: DSE with Constraints ---
\begin{frame}[fragile]{Live Demo: DSE with Objective \& Constraints}
\note{[3 min] Run live. Show maximize throughput subject to latency constraint.}
\begin{lstlisting}
from mlsysim.core.dse import DSE

dse = DSE(
    space={"batch_size": [1,4,8,16,32,64],
           "precision": ["fp16", "int8"]},
    objective="maximize: throughput",
    constraints=["latency < 100"])
def evaluate(params):
    p = mlsysim.Engine.solve(llama, hw,
        batch_size=params["batch_size"],
        precision=params["precision"])
    return {"throughput": p.throughput.magnitude,
            "latency": p.latency.to("ms").magnitude}
best = dse.search(evaluate)
print(f"Best: {best['best_params']}")
\end{lstlisting}
\end{frame}

% --- 6.6 Batching Optimizer ---
\begin{frame}[fragile]{Live Demo: Batching Optimizer (Pareto Front)}
\note{[3 min] Run live. Show Pareto front of batch size vs latency.}
\begin{lstlisting}
opt = mlsysim.BatchingOptimizer()
result = opt.solve(
    model=llama, hardware=hw, seq_len=128,
    arrival_rate_qps=10.0,
    sla_latency_ms=20000.0, precision="fp16")
print(f"Optimal bs: {result.best_batch_size}")
for pt in result.pareto_front:
    print(f" bs={pt['batch_size']:<4} "
          f"lat={pt['p99_latency'].m_as('ms'):.0f} ms")
\end{lstlisting}
\end{frame}

% --- 6.7 Exercise ---
\begin{frame}[fragile]{Exercise: Budget-Constrained Design}
\begin{alertblock}{Your Task (5 minutes)}
Given a \textbf{\$1M budget}, find the best hardware + batch size + precision
configuration for serving Llama-3-8B inference under a 50\,ms latency SLA.
\end{alertblock}

\begin{lstlisting}
# Hint: Use Engine.sweep across hardware tiers,
# filter by feasibility and latency SLA,
# then pick the highest-throughput config
# within budget (check unit_cost in Hardware)
\end{lstlisting}

\note{[5 min] Expected answer: H100 at INT8, batch size 16--32 gives the best
throughput/dollar under 50\,ms. A100 at FP16 fails the SLA at high batch sizes.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.}
\end{frame}

% --- 6.8 Takeaway ---
\begin{frame}{Key Takeaway: Design Space Exploration}
\note{[1 min] Three points, repeat each.}
\begin{center}
\Large
\begin{enumerate}
\item \textbf{Analytical models} make exhaustive DSE feasible ($<$1\,s for $10^4$ configs)
\item \textbf{Pareto fronts} reveal the actual tradeoff structure
\item \textbf{Constraints first}: filter infeasible, then optimize
\end{enumerate}

\vspace{1em}
\normalsize
\textit{``You cannot optimize what you cannot model.\\
You cannot model what you cannot measure.''}
\end{center}
\end{frame}


% =====================================================================
% PART 7: TINYML TO FRONTIER
% =====================================================================
\section{TinyML to Frontier}

% --- 7.1 Key Question ---
\begin{frame}{Key Question}
\note{[1 min] ``Same equation, 9 orders of magnitude apart.''}
\begin{center}
\Large\bfseries
Can the same analytical framework model\\[4pt]
a \$2 microcontroller \emph{and} a \$3M GPU rack?
\end{center}
\vfill
\small
The answer is yes---because the \textbf{Roofline model} is universal.\\
Only the numbers change. The physics is the same.
\end{frame}

% --- 7.2 The Nine Orders of Magnitude ---
\begin{frame}{The 9-Order-of-Magnitude Scale Span}
\note{[2 min] Walk through the table. Compute spans $10^{7}\times$, power $10^{4.7}\times$.}
\begin{columns}[T]
\column{0.52\textwidth}
\centering
\includegraphics[width=0.85\textwidth]{images/pdf/hardware-spectrum.pdf}

\column{0.45\textwidth}
\scriptsize
\begin{tabular}{lrr}
\toprule
\textbf{Device} & \textbf{FLOPS} & \textbf{TDP} \\
\midrule
nRF52840 & 64\,M & 15\,mW \\
ESP32-S3 & 500\,M & 400\,mW \\
\rowcolor{gray!15}
H100 SXM & 989\,T & 700\,W \\
\bottomrule
\end{tabular}

\vspace{0.5em}
\begin{tabular}{lr}
\textbf{Compute} & $\sim 10^{7}\times$ \\
\textbf{Mem BW} & $\sim 10^{7}\times$ \\
\textbf{Power} & $\sim 10^{4.7}\times$ \\
\end{tabular}
\end{columns}

\vspace{0.5em}
\centering
\alert{Same Roofline equation. Nine orders of magnitude apart.}
\end{frame}

% --- 7.3 TinyML Memory Hierarchy ---
\begin{frame}{Flash vs SRAM: The TinyML Memory Wall}
\note{[3 min] Key difference: TinyML has Flash (8 MB, 80 MB/s) vs Cloud HBM (80 GB, 3.35 TB/s).}
\begin{columns}[T]
\column{0.5\textwidth}
\textbf{Cloud GPU (H100)}
\begin{itemize}
\item Weights in HBM (80\,GB)
\item Activations in SRAM (50\,MB L2)
\item Bandwidth: 3.35\,TB/s
\item Model $\leq$ 80\,GB $\Rightarrow$ fits
\end{itemize}

\column{0.5\textwidth}
\textbf{TinyML MCU (ESP32-S3)}
\begin{itemize}
\item Weights in \textbf{Flash} (8\,MB)
\item Activations in \textbf{SRAM} (512\,KiB)
\item Flash BW: 80\,MB/s
\item Model $\leq$ 8\,MB $\Rightarrow$ fits
\item Model $\leq$ 512\,KiB $\Rightarrow$ SRAM-only (12$\times$ faster)
\end{itemize}
\end{columns}

\vspace{1em}
\textbf{Key insight:} mlsysim automatically selects the right memory tier:
\begin{enumerate}
\item Model fits in SRAM $\Rightarrow$ use SRAM bandwidth
\item Model in Flash $\Rightarrow$ use Flash bandwidth (bottleneck!)
\item Model exceeds Flash $\Rightarrow$ infeasible
\end{enumerate}
\end{frame}

% --- 7.4 Energy per Inference ---
\begin{frame}{Energy per Inference: $\mu$J to Joules}
\note{[2 min] 6 orders of magnitude in energy. At TinyML scale, battery life is the constraint.}
\centering
\begin{tabular}{lrrl}
\toprule
\textbf{Device} & \textbf{TDP} & \textbf{Energy/Inf} & \textbf{Use Case} \\
\midrule
nRF52840 & 15\,mW & $\sim$50\,$\mu$J & Keyword spotting \\
ESP32-S3 & 400\,mW & $\sim$1\,mJ & Person detection \\
Jetson Orin NX & 25\,W & $\sim$25\,mJ & Object detection \\
\rowcolor{gray!15}
H100 SXM & 700\,W & $\sim$50\,J & LLM inference \\
\bottomrule
\end{tabular}

\vspace{1em}
\begin{itemize}
\item 6 orders of magnitude in energy per inference
\item TinyML: battery life is the primary constraint
\item Duty cycling (sleep 99\% of the time) extends battery to years
\end{itemize}
\end{frame}

% --- 7.5 Live Demo: Hardware Comparison ---
\begin{frame}[fragile]{Live Demo: nRF52840 vs ESP32 vs H100}
\note{[3 min] Run live. nRF52840 is memory-bound (Flash). H100 finishes in microseconds.}
\begin{lstlisting}
tiny_model = mlsysim.Models.Tiny.KeywordSpotting
devices = [mlsysim.Hardware.Tiny.nRF52840,
           mlsysim.Hardware.Tiny.ESP32_S3,
           mlsysim.Hardware.Cloud.H100]
for d in devices:
    p = mlsysim.Engine.solve(tiny_model, d,
        precision="int8", efficiency=0.3)
    print(f"{d.name:>15}: {p.latency.to('ms'):.2f~P}"
          f" | {p.bottleneck:<8}")
\end{lstlisting}

\vspace{0.3em}
\begin{exampleblock}{Observation}
nRF52840 is memory-bound (Flash bottleneck).
H100 finishes in $\mu$s---but wastes 700\,W doing it.
\textbf{Right-sizing hardware matters.}
mlsysim also supports \textbf{Coral Edge TPU}, Jetson Orin, and Inferentia2.
\end{exampleblock}
\end{frame}

% --- 7.6 Same Roofline, Different Physics ---
\begin{frame}[fragile]{Same Roofline, Different Physics}
\note{[2 min] Both are memory-bound at batch size 1. Same equation, same diagnosis.}
\begin{columns}[T]
\column{0.5\textwidth}
\begin{center}
\[
\text{Latency} = \max\!\left(
\frac{\text{FLOPs}}{\text{Peak} \times \eta},\;
\frac{\text{Weights}}{\text{BW}}
\right)
\]
\end{center}

\vspace{0.3em}
\scriptsize
\begin{tabular}{lll}
\toprule
\textbf{Term} & \textbf{Cloud} & \textbf{TinyML} \\
\midrule
Peak & 989\,TF & 500\,MF \\
BW & 3.35\,TB/s & 80\,MB/s \\
Overhead & 0.01\,ms & 1.0\,ms \\
\bottomrule
\end{tabular}

\column{0.47\textwidth}
\begin{lstlisting}
# Same API, different hardware
for d in [mlsysim.Hardware.Cloud.H100,
          mlsysim.Hardware.Tiny.ESP32_S3]:
    p = mlsysim.Engine.solve(
        mlsysim.Models.Tiny
            .KeywordSpotting,
        d, precision="int8")
    print(f"{d.name}: {p.bottleneck}")
\end{lstlisting}
\end{columns}

\vspace{0.3em}
\centering
\small
\textbf{Both are memory-bound at batch size 1.} Same equation, same diagnosis.
\end{frame}

% --- 7.7 Exercise ---
\begin{frame}[fragile]{Exercise: Right-Sizing for IoT}
\begin{alertblock}{Your Task (5 minutes)}
A smart doorbell needs person detection at $<$100\,ms latency on a
coin-cell battery (500\,mAh @ 3V = 1.5\,Wh). Compare ESP32-S3 vs
nRF52840: which device can run inference for a full year at 1 inference/minute?
\end{alertblock}

\begin{lstlisting}
import mlsysim

model = mlsysim.Models.Tiny.PersonDetection
for hw in [mlsysim.Hardware.Tiny.ESP32_S3,
           mlsysim.Hardware.Tiny.nRF52840]:
    p = mlsysim.Engine.solve(model, hw,
        precision="int8", efficiency=0.3)
    energy_j = p.energy.to("J").magnitude
    inferences_per_year = 60 * 24 * 365
    # YOUR CODE: total energy vs 1.5 Wh battery
\end{lstlisting}

\note{[5 min] Expected: nRF52840 at 15\,mW lasts $>$1 year with duty cycling.
ESP32-S3 at 400\,mW drains the battery in weeks.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.}
\end{frame}

% --- 7.8 Takeaway ---
\begin{frame}{Key Takeaway: TinyML to Frontier}
\note{[1 min] Three points. Right-size hardware to the workload.}
\begin{center}
\Large
\begin{enumerate}
\item The Roofline model spans \textbf{9 orders of magnitude}
\item \textbf{Right-size} hardware to the workload, not the other way around
\item At TinyML scale, \textbf{energy and memory} dominate; at cloud scale, \textbf{cost and communication}
\end{enumerate}

\vspace{1em}
\normalsize
\textit{``The best accelerator is the smallest one\\
that meets your latency and accuracy SLA.''}
\end{center}
\end{frame}


% =====================================================================
|
||
% PART 8: ADVANCED TOPICS
|
||
% =====================================================================
|
||
\section{Advanced Topics}
|
||
|
||
% --- 8.1 Key Question ---
|
||
\begin{frame}{Key Question}
|
||
\note{[1 min] Single-wall analysis is not enough. You need the full pipeline.}
|
||
\begin{center}
|
||
\Large\bfseries
|
||
How do you compose multiple analytical models\\[4pt]
|
||
into an end-to-end system analysis?
|
||
\end{center}
|
||
\vfill
|
||
\small
|
||
Single-wall analysis answers ``what is the bottleneck?''\\
|
||
\textbf{Pipeline composition} answers ``what is the total cost of the whole stack?''
|
||
\end{frame}
|
||
|
||
% --- 8.2 Pipeline Composition Pattern ---
|
||
\begin{frame}[fragile]{The Pipeline Composition Pattern}
|
||
\note{[3 min] Show the Pipeline pattern. Each stage feeds the next.}
|
||
\begin{lstlisting}
|
||
from mlsysim.core.pipeline import Pipeline
|
||
from mlsysim.core.solver import (
|
||
ScalingModel, DistributedModel, EconomicsModel)
|
||
|
||
# Chain: Algorithm -> Fleet -> Economics
|
||
pipe = Pipeline([
|
||
ScalingModel(), # Wall 11: compute budget
|
||
DistributedModel(), # Wall 14: comms overhead
|
||
EconomicsModel(), # Wall 17: total cost
|
||
])
|
||
print(pipe.explain()) # shows DAG + gaps
|
||
\end{lstlisting}
|
||
|
||
\vspace{0.5em}
|
||
\texttt{explain()} shows:
|
||
\begin{itemize}
|
||
\item What each stage requires and produces
|
||
\item Which walls from the 22-wall taxonomy are covered
|
||
\item Where gaps exist (missing inputs)
|
||
\end{itemize}
|
||
\end{frame}
|
||
|
||
% --- 8.3 Pipeline Run ---
|
||
\begin{frame}[fragile]{Running the Pipeline}
|
||
\note{[3 min] Run live. Show all stages and their outputs.}
|
||
\begin{lstlisting}
|
||
from mlsysim.core.constants import Q_
|
||
|
||
results = pipe.run(
|
||
compute_budget=Q_("1e21 FLOP"),
|
||
fleet=mlsysim.Systems.Clusters.Research_256,
|
||
duration_days=30,
|
||
datacenter=mlsysim.Infra.Grids.Quebec,
|
||
mfu=0.45,
|
||
infrastructure_multiplier=2.5)
|
||
|
||
for stage_name, result in results.items():
|
||
print(f"\n--- {stage_name} ---")
|
||
print(result)
|
||
\end{lstlisting}
|
||
|
||
\vspace{0.5em}
|
||
\textbf{Key design principle:} Each resolver's output fields become available
|
||
as inputs to subsequent stages. The pipeline is \emph{not} a black box---it
|
||
is a transparent analytical tool.
|
||
\end{frame}
|
||
|
||
% --- 8.4 Sensitivity Analysis ---
|
||
\begin{frame}[fragile]{Wall 21: Sensitivity Analysis}
|
||
\note{[3 min] ``Which knob should I turn next?'' The parameter with the largest partial derivative.}
|
||
\begin{lstlisting}
|
||
solver = mlsysim.SensitivitySolver()
|
||
result = solver.solve(
|
||
model=llama, hardware=hw, precision="fp16",
|
||
perturbation_pct=10.0, efficiency=0.5)
|
||
print(f"Binding: {result.binding_constraint}")
|
||
for p, s in result.sensitivities.items():
|
||
tag = "<<<" if p == result.binding_constraint else ""
|
||
print(f" {p:>20}: {s:+.4f} {tag}")
|
||
\end{lstlisting}
|
||
|
||
\vspace{0.3em}
|
||
\small
|
||
\textbf{Rule:} Invest in the parameter with the \emph{largest} $|\partial T / \partial p|$.\\
|
||
Improving a non-binding parameter yields \textbf{zero} measurable gain.
|
||
\end{frame}
|
||
|
||
% --- 8.5 Inverse Roofline (Synthesis) ---
\begin{frame}[fragile]{Wall 22: Inverse Roofline (SynthesisSolver)}
\note{[3 min] Most powerful move: derive hardware requirements from SLA. ``I need 50ms. What hardware?''}
\begin{lstlisting}
from mlsysim.core.constants import Q_

solver = mlsysim.SynthesisSolver()
result = solver.solve(
    model=llama, target_latency=Q_("50 ms"),
    precision="fp16", efficiency=0.5)
print(f"Required BW: {result.required_bw:.0f~P}")
print(f"Required FLOPS: {result.required_flops:.1f~P}")
\end{lstlisting}

\vspace{0.5em}
\textbf{Usage:} ``I need 50\,ms TTFT. What hardware do I need?''\\
$\Rightarrow$ Derive specs from SLA, then match to real accelerators.
\end{frame}

% --- 8.6 Fallacies & Pitfalls ---
\begin{frame}{Fallacies \& Pitfalls (Patterson Tradition)}
\note{[2 min] Walk through each fallacy with quantitative rebuttal.}
\begin{block}{Fallacy: ``Cheaper hardware is always more cost-effective''}
Reality: A 2$\times$ cheaper GPU that takes 3$\times$ longer has \emph{higher} TCO.\\
$\text{TCO} = \text{CapEx} + \text{OpEx}$; slower hardware burns more electricity.
\end{block}

\begin{block}{Fallacy: ``More FLOPS = proportionally faster''}
Reality: A100 $\to$ H100 is 6.3$\times$ peak FLOPS but only $\sim$2$\times$ for
memory-bound LLM inference. The binding constraint is bandwidth, not compute.
\end{block}

\begin{block}{Pitfall: Optimizing a non-binding parameter}
If inference is memory-bound, doubling FLOPS gives 0\% speedup.\\
Use \texttt{SensitivitySolver} to find the binding constraint \emph{first}.
\end{block}

\begin{block}{Pitfall: Ignoring the infrastructure multiplier}
GPU cost is 40\% of total datacenter TCO. Budgeting for GPUs alone\\
underestimates the true cost by 2--3$\times$.
\end{block}
\end{frame}

% --- 8.7 The Iron Law Revisited ---
\begin{frame}{The Iron Law Revisited: All Five Terms}
\note{[2 min] Return to the master equation. Every wall maps to one of these five terms.}
\begin{center}
\Large
\[
T = \frac{\text{FLOPs}}{N \times \text{Peak} \times \text{MFU} \times \eta_{\text{scale}} \times \text{Goodput}}
\]
\end{center}

\vspace{0.5em}
\centering
\begin{tabular}{lll}
\toprule
\textbf{Term} & \textbf{What reduces it} & \textbf{Wall} \\
\midrule
$N$ & Budget (buy more GPUs) & Wall 17 \\
Peak & GPU generation (H100$\to$B200)& Wall 1 \\
MFU & FlashAttention, kernel fusion & Wall 3 \\
$\eta_{\text{scale}}$& Network BW, gradient compression & Wall 14 \\
Goodput & Checkpointing, fault tolerance & Walls 15, 19 \\
\bottomrule
\end{tabular}

\vspace{0.5em}
\textbf{Every wall in the 22-wall taxonomy maps to one of these five terms.}\\
That is the entire framework.
\end{frame}

% --- 8.8 Exercise ---
\begin{frame}[fragile]{Exercise: Binding Constraint Analysis}
\note{[5 min] At bs=1, memory bandwidth is binding. At bs=64, peak FLOPS. This IS the Roofline transition.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.}
\begin{alertblock}{Your Task (5 minutes)}
Use \texttt{SensitivitySolver} on Llama-3-8B + H100.
Which parameter is binding? Now switch to \texttt{batch\_size=64} and
re-run. Does the binding constraint change?
\end{alertblock}

\begin{lstlisting}
solver = mlsysim.SensitivitySolver()
# Batch size 1
r1 = solver.solve(llama, hw, efficiency=0.5)
print(f"bs=1: binding = {r1.binding_constraint}")
# Batch size 64
p64 = mlsysim.Engine.solve(llama, hw, batch_size=64)
print(f"bs=64: bottleneck = {p64.bottleneck}")
\end{lstlisting}

\end{frame}


% =====================================================================
% PART 9: WRAP-UP & FUTURE
% =====================================================================
\section{Wrap-up \& Future}

% --- 9.1 The 22 Walls in One Slide ---
\begin{frame}{The 22 Walls of ML Systems}
\note{[2 min] Full recap. Every wall maps to one Iron Law term.}
\centering
\small
\begin{tabular}{clcl}
\toprule
\textbf{\#} & \textbf{Wall} & \textbf{\#} & \textbf{Wall} \\
\midrule
1 & Compute & 12 & Reasoning (Emerging) \\
2 & Memory & 13 & Fidelity (Compression) \\
3 & Software (MFU) & 14 & Communication \\
4 & Serving & 15 & Fragility (Reliability) \\
5 & Batching (KV) & 16 & Multi-tenant (Queueing) \\
6 & Streaming & 17 & Capital (TCO) \\
7 & Tail Latency & 18 & Sustainability \\
8 & Ingestion & 19 & Checkpoint \\
9 & Transformation & 20 & Safety (Privacy) \\
10 & Locality & 21 & Sensitivity \\
11 & Complexity & 22 & Synthesis (Inverse) \\
\bottomrule
\end{tabular}

\vspace{0.5em}
\textbf{6 Domains:} Node $\cdot$ Data $\cdot$ Algorithm $\cdot$ Fleet $\cdot$ Operations $\cdot$ Meta-Analysis

\vspace{0.3em}
\alert{Every wall maps to one term in the Iron Law.}
\end{frame}

% --- 9.2 What This Tool Does NOT Model ---
\begin{frame}{What \texttt{mlsysim} Does \emph{Not} Model}
\note{[2 min] Honest about limitations. Not a simulator, not a profiler. A reasoning framework.}
\begin{columns}[T]
\column{0.5\textwidth}
\textbf{Not modeled (v0.1.0):}
\begin{itemize}
\item Cache effects / tiling / fusion
\item Real network congestion / jitter
\item CUDA kernel scheduling
\item Multi-model co-location
\item Compiler optimizations
\end{itemize}

\column{0.5\textwidth}
\textbf{By design:}
\begin{itemize}
\item Not a cycle-accurate simulator
\item Not a profiler replacement
\item Not a deployment tool
\item \textbf{Is:} a first-principles\\
analytical reasoning tool
\end{itemize}

\vspace{0.5em}
\textbf{Use it for:}
\begin{itemize}
\item ``Which constraint is binding?''
\item ``Is this hardware sufficient?''
\item ``What should I try first?''
\end{itemize}
\end{columns}

\vspace{0.5em}
\centering
\small\itshape
``All models are wrong, but some are useful.'' --- George Box
\end{frame}

% --- 9.3 v0.2.0 Roadmap ---
\begin{frame}{v0.2.0 Roadmap}
\note{[2 min] Quick overview. Contributions welcome.}
\small
\begin{columns}[T]
\begin{column}{0.48\textwidth}
\textbf{Models \& Hardware}
\begin{enumerate}
\item \textbf{MoE support} --- Expert routing + parallelism
\item \textbf{GQA / MQA} --- Grouped-query KV cache
\item \textbf{Network congestion} --- Contention-aware collectives
\end{enumerate}
\end{column}
\begin{column}{0.48\textwidth}
\textbf{Tooling \& Integration}
\begin{enumerate}\setcounter{enumi}{3}
\item \textbf{Spot pricing} --- Cloud cost optimization
\item \textbf{Marimo dashboards} --- Live exploration
\item \textbf{HuggingFace Hub} --- Auto-import models
\end{enumerate}
\end{column}
\end{columns}

\vspace{0.5em}
\begin{center}
Contributions welcome: \texttt{github.com/harvard-edge/mlsysim}
\end{center}
\end{frame}

% --- 9.4 Design Challenge (Capstone) ---
\begin{frame}[fragile]{Design Challenge: Capstone Exercise}
\begin{alertblock}{The Problem}
\textbf{\$5M budget.} Serve Llama-3 70B at \textbf{1{,}000 QPS} with
\textbf{$<$100\,ms TTFT} in \textbf{two regions} (US-East + EU-West).
Design the fleet.
\end{alertblock}

\vspace{0.3em}
\begin{columns}[T]
\column{0.48\textwidth}
\textbf{You must specify:}
\begin{enumerate}\footnotesize
\item Hardware choice (which GPU? how many?)
\item Parallelism strategy (TP $\times$ PP)
\item Precision (FP16? INT8? FP8?)
\item Geographic placement (carbon?)
\item Redundancy (replicas per region?)
\end{enumerate}

\column{0.49\textwidth}
\begin{lstlisting}
import mlsysim

# Capstone starter code
model = mlsysim.Models.Language.Llama3_70B
hw = mlsysim.Hardware.H100

# Step 1: Can it fit?
p = mlsysim.Engine.solve(
    model, hw, precision="fp16")
print(f"Feasible: {p.feasible}")
print(f"Latency: {p.latency:~P}")

# Step 2: TCO for your fleet
econ = mlsysim.EconomicsModel()
# YOUR CODE: design the fleet
\end{lstlisting}
\end{columns}

\note{[35 min] This is intentionally under-specified. Students must make and defend
assumptions. Key insight: 70B at FP16 = 140\,GB, needs tensor parallelism
across 2+ H100s per replica. At 1000 QPS with 100\,ms TTFT, you need many
replicas. Budget constrains how many.}
\end{frame}

% --- 9.4b: Personal Transfer Moment ---
\begin{frame}{Your Turn: Name That Wall}
\note{[3 min] THE TRANSFER MOMENT. Silent writing. Then 2-3 volunteers share.}
\centering
\Large
\textbf{Think of one ML system in your own work.}\\[1cm]
\normalsize
Write down:\\[0.3cm]
\begin{enumerate}
\item What is the system? (model + hardware + use case)
\item Which of the 22 walls is the \textbf{binding constraint}?
\item What would you change to move the wall?
\end{enumerate}

\vfill
\small\textcolor{gray}{2 minutes. Then we will hear from 3 volunteers.}
\end{frame}

% --- 9.5 Resources & Q&A ---
\begin{frame}{Resources \& Next Steps}
\note{[3 min] Point to: mlsysim GitHub, textbook, cheatsheet. ``Take a photo of the cheatsheet.''}
\begin{columns}[T]
\column{0.55\textwidth}
\textbf{Get Started}
\begin{itemize}
\item \texttt{pip install mlsysim}
\item GitHub: \texttt{harvard-edge/mlsysim}
\item 14 examples + 3 Marimo notebooks
\item Full docs: \texttt{mlsysim.readthedocs.io}
\end{itemize}

\vspace{0.5em}
\textbf{The Textbook}
\begin{itemize}
\item \emph{Machine Learning Systems}
\item Volume I: Foundations (single node)
\item Volume II: Systems at Scale (fleet)
\item \texttt{mlsysbook.ai}
\end{itemize}

\column{0.40\textwidth}
\textbf{Key Papers}
\begin{itemize}
\item Williams et al.\ (2009)\\
{\footnotesize Roofline Model}
\item Chowdhery et al.\ (2022)\\
{\footnotesize PaLM / MFU}
\item Hoffmann et al.\ (2022)\\
{\footnotesize Chinchilla Scaling}
\item Patterson et al.\ (2021)\\
{\footnotesize Carbon \& Training}
\item Kwon et al.\ (2023)\\
{\footnotesize PagedAttention}
\end{itemize}
\end{columns}

\vspace{1em}
\begin{center}
\Large\bfseries Thank you! Questions?
\end{center}
\end{frame}

\end{document}