% =============================================================================
% MLSys·im Tutorial — Parts 0–4 (Morning Session)
% =============================================================================
\documentclass[aspectratio=169, 12pt]{beamer}
\usepackage{../../../slides/assets/beamerthememlsys}

\mlsyssetup{
  volume = {Tutorial},
  chapter = {Tutorial},
  logo = {../../../slides/assets/img/logo-mlsysbook.png},
  instlogo = {../../../slides/assets/img/logo-harvard.png},
  chaptertitle = {MLSys·im: First-Principles ML Systems Modeling},
}

% --- Fonts ---
\usepackage{fontspec}
\setsansfont{Helvetica Neue}[
  BoldFont={Helvetica Neue Bold},
  ItalicFont={Helvetica Neue Italic},
  BoldItalicFont={Helvetica Neue Bold Italic},
]
% Fall back to Courier New if JetBrains Mono is not available
\IfFontExistsTF{JetBrains Mono}{
  \setmonofont{JetBrains Mono}[Scale=0.85]
}{
  \setmonofont{Courier New}[Scale=0.90]
}

% --- Packages ---
\usepackage{amsmath}
\usepackage{amssymb} % for \checkmark in the roadmap tables
\usepackage{booktabs}
\usepackage[table]{xcolor}
\usepackage{listings}
\usepackage{tikz}
\usetikzlibrary{arrows.meta, positioning, calc, decorations.pathreplacing}

% --- Code listings ---
\lstset{
  language=Python,
  basicstyle=\ttfamily\footnotesize,
  keywordstyle=\color{crimson}\bfseries,
  stringstyle=\color{datastroke},
  commentstyle=\color{midgray}\itshape,
  backgroundcolor=\color{computeblue!20},
  frame=single,
  rulecolor=\color{computestroke},
  numbers=none,
  breaklines=true,
  columns=fullflexible,
  keepspaces=true,
  showstringspaces=false,
  xleftmargin=4pt,
  xrightmargin=4pt,
  aboveskip=6pt,
  belowskip=4pt,
}

% --- Convenience macros ---
\newcommand{\mlsysim}{\texttt{mlsysim}}
\newcommand{\wallbox}[2]{%
  \begin{block}{#1}#2\end{block}%
}
\newcommand{\PredictStart}{\begin{alertblock}{Predict Before You Peek}}
\newcommand{\PredictEnd}{\end{alertblock}}

% --- Image paths ---
\graphicspath{{images/}}

% --- Section count (must match actual \section{} count) ---
\setcounter{mlsystotalsections}{6}

\title{MLSys·im: First-Principles ML Systems Modeling}
\subtitle{A Hands-On Tutorial}
\author{Vijay Janapa Reddi}
\institute{Harvard University}
\date{Tutorial}

% =============================================================================
\begin{document}

% =============================================================================
% PART 0: WELCOME & SETUP (5 slides)
% =============================================================================
\section{Welcome \& Setup}

% --- Slide 0.1: Title ---
\begin{frame}
\note{[1 min] Welcome attendees, set the tone.
Welcome to the MLSys-im tutorial. Today we will build quantitative
intuition for ML systems from first principles.
% -- FLEX: [CORE] Title slide --- do not skip.
}
\titlepage
\end{frame}

% --- Slide 0.1b: The $200M Question ---
\begin{frame}{The \$200 Million Question}
\note{[3 min] THE HOOK. Open strong. Don't touch your laptop. Look at the audience.}

\centering
\Large
\textbf{Meta spent \$200M training Llama-3-405B.}\\[1cm]

\normalsize
Before a single GPU was purchased:\\[0.3cm]
\begin{itemize}
\item How would you know \textbf{16,384 H100s} was the right fleet?
\item How would you know \textbf{405B parameters} was the right model size?
\item How would you know it would take \textbf{54 days}, not 540?
\end{itemize}

\vfill
\small\textcolor{gray}{We will answer all three questions today --- on your laptop, in under a second, with no GPU.}
\end{frame}

% --- Slide 0.1c: Live Demo Reveal ---
\begin{frame}[fragile]{Answer in 0.1 Seconds}
\note{[2 min] Run this LIVE. The room should gasp at how fast the answer appears.}

\begin{lstlisting}
import mlsysim

profile = mlsysim.Engine.solve(
    mlsysim.Models.Language.Llama3_8B,
    mlsysim.Hardware.Cloud.H100,
    batch_size=1,
)
print(f"Bottleneck: {profile.bottleneck}")  # Memory
print(f"MFU: {profile.mfu:.3f}")            # 0.003
\end{lstlisting}

\vfill
\centering
\textbf{That took 0.1 seconds. On a laptop. No GPU.}\\[0.2cm]
\small Now imagine doing this for every hardware option, every model size,\\
every parallelism strategy, every region. \textbf{That is mlsysim.}
\end{frame}

% --- Slide 0.2: What You Will Learn Today ---
\begin{frame}{What You Will Learn Today}
\note{[2 min] Walk through objectives quickly. Emphasize that by the
end of the day every attendee will be able to do these five things.
% -- FLEX: [CORE]
}

\small
By the end of this tutorial you will be able to:
\begin{enumerate}
\item \textbf{Identify} which physical constraint is the binding bottleneck
for any ML workload on any hardware.
\item \textbf{Decompose} training and inference time using the Iron Law.
\item \textbf{Compare} hardware configurations quantitatively with \mlsysim.
\item \textbf{Reason} about the compute--memory--communication tradeoff space.
\item \textbf{Estimate} TCO and carbon footprint for a real deployment.
\end{enumerate}

\vspace{0.3cm}
\centering
\textit{All you need is a laptop and} \texttt{pip install mlsysim}
\end{frame}

% --- Slide 0.3: Setup Check ---
\begin{frame}[fragile]{Setup: Install \& Verify}
\note{[3 min] Give attendees 2 minutes to run these commands.
Walk around and help anyone with pip issues.
If someone cannot install, they can pair with a neighbor.
% -- FLEX: [CORE] --- must verify before proceeding.
}

\small
Open a terminal and run:

\begin{lstlisting}
pip install mlsysim
python3 -c "import mlsysim; print(mlsysim.__version__)"
# Expected output: 0.1.0
\end{lstlisting}

\vspace{0.3cm}
Then run the hello-world sanity check:

\begin{lstlisting}
import mlsysim
model = mlsysim.Models.Language.Llama3_8B
hw = mlsysim.Hardware.Cloud.H100
prof = mlsysim.Engine.solve(model, hw, batch_size=1)
print(prof.bottleneck)  # -> "Memory"
\end{lstlisting}

\vspace{0.2cm}
\centering
\alert{If you see \texttt{Memory}, you are ready.}

\vspace{0.3cm}
\footnotesize
\textit{Convention for the rest of the day:}\\
\texttt{import mlsysim} is assumed.
We use \texttt{llama} $=$ \texttt{mlsysim.Models.Language.Llama3\_8B}
and \texttt{hw} $=$ \texttt{mlsysim.Hardware.Cloud.H100} as shorthands.
\end{frame}

% --- Slide 0.4: The 22-Wall Taxonomy ---
\begin{frame}{The 22 Physical Walls of ML Systems}
\note{[2 min] This is the road map for the day. Point out that
we will hit walls 1--7 (Node) before lunch and walls 8--22
after lunch. Each wall has one equation and one mlsysim solver.
Ask: ``How many of these walls have you personally hit?'' Show of hands.
% -- FLEX: [CORE]
}

\small
\begin{columns}[T]
\begin{column}{0.48\textwidth}
\textbf{Domain 1: Node}\\[2pt]
\begin{enumerate}\setlength\itemsep{1pt}
\item Compute Wall
\item Memory Wall
\item Software Wall (MFU)
\item Serving Wall
\item Batching Wall (KV cache)
\item Streaming Wall
\item Tail Latency Wall
\end{enumerate}

\vspace{0.2cm}
\textbf{Domain 2: Data}\\[2pt]
\begin{enumerate}\setlength\itemsep{1pt}\setcounter{enumi}{7}
\item Ingestion Wall
\item Transformation Wall
\item Locality Wall
\end{enumerate}
\end{column}

\begin{column}{0.48\textwidth}
\textbf{Domain 3: Algorithm}\\[2pt]
\begin{enumerate}\setlength\itemsep{1pt}\setcounter{enumi}{10}
\item Complexity Wall (Chinchilla)
\item Reasoning Wall
\item Fidelity Wall (Compression)
\end{enumerate}

\vspace{0.2cm}
\textbf{Domain 4: Fleet}\\[2pt]
\begin{enumerate}\setlength\itemsep{1pt}\setcounter{enumi}{13}
\item Communication Wall
\item Fragility Wall
\item Multi-Tenant Wall
\end{enumerate}

\vspace{0.2cm}
\textbf{Domain 5: Operations}\\[2pt]
\begin{enumerate}\setlength\itemsep{1pt}\setcounter{enumi}{16}
\item Capital Wall (TCO)
\item Sustainability Wall
\item Checkpoint Wall
\item Safety Wall
\end{enumerate}
\end{column}
\end{columns}
\end{frame}

% --- Slide 0.5: The Iron Law (Preview) ---
\mlsysfocus{The Iron Law of ML Systems}{%
\[
T = \frac{\text{FLOPs}}{N \;\times\; \text{Peak} \;\times\; \text{MFU} \;\times\; \eta_{\text{scaling}} \;\times\; \text{Goodput}}
\]
\\[0.5cm]
\normalsize
Every wall maps to one of these five denominator terms.\\
This single equation is our compass for the entire day.
}

% =============================================================================
% RELATED WORK & POSITIONING (8 slides)
% =============================================================================
\input{related_work}

% --- Roadmap: You Are Here (Morning) ---
\begin{frame}{Roadmap: You Are Here}
\note{[1 min] Quick orientation. We just finished the setup. Now the real work begins.}

\centering\small
\begin{tabular}{rll}
\toprule
\textbf{Time} & \textbf{Part} & \textbf{Status} \\
\midrule
9:00--9:30 & Part 0: Welcome \& Setup & \checkmark Done \\
\rowcolor{crimson!12}
9:30--10:30 & \textbf{Part 1: Iron Law \& Roofline} & \textbf{$\leftarrow$ You are here} \\
10:45--11:45 & Part 2: Memory Walls \& Serving & \\
11:45--12:00 & Part 3: Compression & \\
\midrule
\textit{12:00--1:00} & \textit{Lunch} & \\
\midrule
1:00--2:15 & Part 4: Going Distributed & \\
2:30--3:15 & Part 5: Economics \& Sustainability & \\
3:15--3:45 & Part 6: Design Space Exploration & \\
3:45--4:15 & Part 7: TinyML to Frontier & \\
4:15--4:45 & Part 8: Advanced Topics & \\
4:45--5:00 & Part 9: Wrap-Up \& Capstone & \\
\bottomrule
\end{tabular}
\end{frame}

% =============================================================================
% PART 1: THE IRON LAW & ROOFLINE (15 slides)
% =============================================================================
\section{Iron Law \& Roofline}

% --- Slide 1.1: Key Question ---
\begin{frame}{Key Question}
\note{[1 min] Pose the question dramatically. Pause for 5 seconds.
``This is the most important question in ML systems engineering.
By the end of this section you will answer it in 3 lines of Python.''
% -- FLEX: [CORE]
}

\centering
\Large\bfseries
Why doesn't doubling FLOPS\\[0.3cm]
double your throughput?
\end{frame}

% --- Slide 1.2: Constraints Drive Architecture ---
\begin{frame}{Constraints Drive Architecture}
\note{[2 min] ``You don't choose a Transformer because it's trendy;
you choose it because of how it parallelizes on real silicon.
AI is not magic --- it is infrastructure, and infrastructure has laws.''
% -- FLEX: [CORE]
}

\small
\begin{itemize}
\item Hardware has \textbf{finite compute} (FLOPS), \textbf{finite bandwidth}
(GB/s), and \textbf{finite memory} (GB).
\item Every workload demands some amount of each.
\item The \textbf{binding constraint} is the one that takes the longest.
\item \alert{You optimize the bottleneck, not the fast part.}
\end{itemize}

\vspace{0.5cm}
\centering
\begin{tikzpicture}[>=Stealth, node distance=3cm]
\node[draw, fill=computeblue, rounded corners, minimum width=2.5cm, minimum height=1cm]
(compute) {\textbf{Compute}};
\node[draw, fill=datagreen, rounded corners, minimum width=2.5cm, minimum height=1cm,
right=of compute] (memory) {\textbf{Memory BW}};
\node[draw, fill=routingorange, rounded corners, minimum width=2.5cm, minimum height=1cm,
right=of memory] (network) {\textbf{Network}};
\draw[->, thick, crimson] (compute) -- node[above, font=\scriptsize] {which is slowest?} (memory);
\draw[->, thick, crimson] (memory) -- node[above, font=\scriptsize] {which is slowest?} (network);
\end{tikzpicture}
\end{frame}

% --- Slide 1.3: The Roofline Model ---
\begin{frame}{The Roofline Model (Williams et al., 2009)}
\note{[3 min] Draw the two regimes on the board. Left = memory-bound,
right = compute-bound. The ridge point is where they cross.
``Before I show numbers: if a model does 16B FLOPs and
loads 16 GB of weights, what is its arithmetic intensity?''
Expected: 1 FLOP/byte. That is far left on the Roofline.
WARN: Students conflate FLOPS with throughput.
% -- FLEX: [CORE]
}

\small
\[
\text{Attainable FLOPS} = \min\!\bigl(\text{Peak FLOPS},\;\;
\text{BW} \times \text{Arithmetic Intensity}\bigr)
\]

\vspace{0.2cm}
\begin{columns}[T]
\begin{column}{0.55\textwidth}
\textbf{Arithmetic Intensity} (AI):
\[
\text{AI} = \frac{\text{FLOPs}}{\text{Bytes moved}}
\;\;\bigl[\text{FLOP/byte}\bigr]
\]

\vspace{0.2cm}
\begin{itemize}\setlength\itemsep{2pt}
\item \textbf{AI $<$ Ridge Point} $\Rightarrow$ \colorbox{datagreen}{Memory-bound}
\item \textbf{AI $>$ Ridge Point} $\Rightarrow$ \colorbox{computeblue}{Compute-bound}
\end{itemize}

\vspace{0.2cm}
Ridge Point $=$ Peak FLOPS $/$ Peak BW
\end{column}

\begin{column}{0.42\textwidth}
\centering
\begin{tikzpicture}[scale=0.7]
% axes
\draw[->, thick] (0,0) -- (6.5,0) node[right, font=\scriptsize] {AI (FLOP/B)};
\draw[->, thick] (0,0) -- (0,4.5) node[above, font=\scriptsize] {GFLOPS};
% memory roof
\draw[very thick, datastroke] (0,0) -- (3,3);
% compute roof
\draw[very thick, computestroke] (3,3) -- (6.2,3);
% ridge point
\fill[crimson] (3,3) circle (3pt);
\node[above right, font=\scriptsize, crimson] at (3,3) {Ridge};
% labels
\node[font=\scriptsize, datastroke, rotate=42] at (1.2,1.7) {BW-limited};
\node[font=\scriptsize, computestroke] at (4.8,3.4) {Compute-limited};
\end{tikzpicture}
\end{column}
\end{columns}
\end{frame}
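
% --- Slide 1.3b: Roofline as arithmetic (sketch) ---
\begin{frame}[fragile]{The Roofline Is Just \texttt{min()}}
\note{[1 min] Optional reinforcement. Plain Python, no mlsysim --- the point
is that the Roofline is simple enough to check by hand.}

\small
A back-of-envelope sketch of the equation above in plain Python
(H100 specs; no \mlsysim{} required):

\begin{lstlisting}
peak_flops = 989e12    # H100 FP16 peak, FLOP/s
peak_bw    = 3.35e12   # H100 HBM3 bandwidth, bytes/s
ridge      = peak_flops / peak_bw   # ~295 FLOP/byte

def attainable(ai):
    # Roofline: the lower of the compute roof and the memory roof
    return min(peak_flops, peak_bw * ai)

for ai in [1, 32, 295, 1000]:
    regime = "memory" if ai < ridge else "compute"
    print(f"AI={ai:>4}: {attainable(ai)/1e12:6.1f} TFLOPS ({regime}-bound)")
\end{lstlisting}

\vspace{0.2cm}
At AI $=$ 1 FLOP/byte the chip attains 3.35 of its 989 TFLOPS.
\end{frame}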

% --- Slide 1.4: The Compute Wall ---
\begin{frame}{Wall 1: The Compute Wall}
\note{[2 min] ``This is the speed limit. No software trick can make
your model run faster than the chip can crunch numbers.''
% -- FLEX: [CORE]
}

\small
\wallbox{The Compute Wall}{
\[
T_{\text{compute}} = \frac{\text{Operations}}{\text{Peak FLOPS} \times \text{Efficiency}}
\]
}

\vspace{0.2cm}
\textbf{Example:} ResNet-50 inference at batch 256 on H100

\begin{itemize}
\item FLOPs = $8.0 \times 10^{9} \times 256 = 2.05 \times 10^{12}$
\item H100 FP16 Peak = 989 TFLOPS
\item At 50\% MFU: $T = \frac{2.05 \times 10^{12}}{989 \times 10^{12} \times 0.5} \approx 4.1\;\text{ms}$
\end{itemize}

\vspace{0.2cm}
\alert{The chip is the ceiling. MFU is how close you get to it.}
\end{frame}

% --- Slide 1.5: The Memory Wall ---
\begin{frame}{Wall 2: The Memory Wall}
\note{[2 min] ``Quick mental math: 16 GB model, 3.35 TB/s bandwidth.
How long to load?'' Give 10 seconds. Expected: 16/3350 = 4.8 ms.
WARN: Students assume compute is always the bottleneck because
GPUs are marketed on TFLOPS.
% -- FLEX: [CORE]
}

\small
\wallbox{The Memory Wall}{
\[
T_{\text{memory}} = \frac{\text{Weight Bytes}}{\text{Memory Bandwidth}}
\]
}

\vspace{0.2cm}
\textbf{Example:} Llama-3 8B at batch size 1 on H100

\begin{itemize}
\item Weight size (FP16) = $8\text{B} \times 2\;\text{bytes} = 16\;\text{GB}$
\item H100 HBM3 BW = 3.35 TB/s
\item $T = \frac{16}{3350} \approx 4.8\;\text{ms}$ just to load weights
\item Meanwhile, compute finishes in $\sim$0.03 ms
\end{itemize}

\vspace{0.2cm}
\alert{At batch size 1, LLM inference is $\sim$\,160$\times$ memory-bound.}
\end{frame}
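
% --- Slide 1.5b: Checking the 160x claim (sketch) ---
\begin{frame}[fragile]{Checking the 160$\times$ Claim}
\note{[1 min] Optional. Plain Python; the 50\% MFU is an assumption.}

\small
The same mental math as code. The 50\% MFU on the compute term is an
assumed value; with it, the ratio lands near the slide's $\sim$160$\times$:

\begin{lstlisting}
params, bpp = 8e9, 2            # Llama-3 8B, FP16
peak, bw    = 989e12, 3.35e12   # H100
mfu         = 0.5               # assumed

t_mem = params * bpp / bw               # load every weight once
t_cmp = 2 * params / (peak * mfu)       # ~2 FLOPs per weight per token
print(f"memory {t_mem*1e3:.1f} ms vs compute {t_cmp*1e3:.2f} ms "
      f"= {t_mem/t_cmp:.0f}x")          # ~150x
\end{lstlisting}
\end{frame}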

% --- Slide 1.6: Predict Before You Peek #1 ---
\begin{frame}{Predict: H100 vs MI300X vs Gaudi\,3}
\note{[3 min] PREDICTION. Give the audience 60 seconds to think.
Expected answer: all memory-bound. BW ratios determine speedup, not FLOPS.
After the reveal, hammer home: ``The bottleneck determines the speedup.''
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.
% -- FLEX: [CORE]
}

\centering
\Large\bfseries
Three flagship accelerators. Same workload.\\[0.3cm]
\normalsize
Llama-3 8B, batch size 1, FP16 inference.\\
Which is fastest---and by how much?\\[0.3cm]

\pause

\scriptsize
\begin{tabular}{lccc}
\toprule
& \textbf{H100 (NVIDIA)} & \textbf{MI300X (AMD)} & \textbf{Gaudi\,3 (Intel)} \\
\midrule
Peak FP16 & 989 TFLOPS & 1,307 TFLOPS & 1,835 TFLOPS \\
HBM BW & 3.35 TB/s & 5.3 TB/s & 3.7 TB/s \\
HBM Capacity & 80 GB & 192 GB & 128 GB \\
\midrule
Bottleneck & Memory & Memory & Memory \\
Weight-load time & 4.8 ms & 3.0 ms & 4.3 ms \\
\textbf{Speedup vs H100} & --- & \textbf{1.6$\times$} & \textbf{1.1$\times$} \\
\bottomrule
\end{tabular}

\vspace{0.3cm}
\pause
\small
\alert{MI300X has fewer FLOPS than Gaudi\,3 but wins on bandwidth.\\
FLOPS don't determine speed when memory-bound.}
\end{frame}

% --- Slide 1.7: Live Demo --- Engine.solve (multi-vendor) ---
\begin{frame}[fragile]{Live Demo: Three Vendors, One API}
\note{[3 min] Run this live. The key moment: all three are memory-bound.
The ranking follows bandwidth, not FLOPS. This is ISCA---show that
mlsysim is not an NVIDIA-only tool.
% -- FLEX: [CORE]
}

\small
Run this in your Python session:

\begin{lstlisting}
import mlsysim

model = mlsysim.Models.Language.Llama3_8B
for hw_name in ["H100", "MI300X", "Gaudi3"]:
    hw = getattr(mlsysim.Hardware.Cloud, hw_name)
    p = mlsysim.Engine.solve(model, hw, batch_size=1)
    print(f"{hw.name}: {p.bottleneck}, "
          f"{p.latency:.2f}")
\end{lstlisting}

\vspace{0.2cm}
\begin{exampleblock}{What to look for}
\begin{itemize}
\item \texttt{bottleneck}: Memory for \textbf{all three}
\item Ranking follows BW (MI300X $>$ Gaudi\,3 $>$ H100), not FLOPS
\item Same API, same physics, different silicon
\end{itemize}
\end{exampleblock}
\end{frame}

% --- Slide 1.8: MFU --- The Software Wall ---
\begin{frame}{Wall 3: MFU --- The Software Wall}
\note{[2 min] ``MFU measures the gap between what the hardware could do
and what your software actually achieves. A 50\% MFU means you are
paying for twice the hardware you are using.''
% -- FLEX: [CORE]
}

\small
\wallbox{Model FLOPs Utilization}{
\[
\text{MFU} = \frac{\text{Achieved FLOPS}}{\text{Peak FLOPS}}
\]
}

\vspace{0.2cm}
\begin{columns}[T]
\begin{column}{0.55\textwidth}
\textbf{What eats MFU?}
\begin{itemize}\setlength\itemsep{2pt}
\item Kernel launch overhead
\item Memory stalls (cache misses)
\item Framework overhead (Python $\to$ CUDA)
\item Suboptimal operator fusion
\item \alert{Being memory-bound} (the biggest one!)
\end{itemize}
\end{column}
\begin{column}{0.42\textwidth}
\centering
\textbf{Typical MFU ranges}\\[4pt]
\scriptsize
\begin{tabular}{lc}
\toprule
Workload & MFU \\
\midrule
LLM training (optimized) & 40--55\% \\
LLM inference (bs=1) & $<$5\% \\
ResNet training & 30--40\% \\
FlashAttention & 60--75\% \\
\bottomrule
\end{tabular}
\end{column}
\end{columns}

\vspace{0.2cm}
\alert{Improving MFU is often cheaper than buying more GPUs.}
\end{frame}

% --- Slide 1.8b: What Is Eta? ---
\begin{frame}{What Is $\eta$? (The Efficiency Parameter)}
\note{[3 min] CRITICAL --- every demo uses eta. Explain it ONCE here, then it's understood for the day.
ANALOGY: ``eta is to ML systems what CPI is to CPU design --- an empirical constant that bridges peak specs and reality.''}

\small
\textbf{$\eta$ = Achieved FLOPS / Peak FLOPS} (Model FLOPs Utilization)

\vspace{0.3cm}
The gap between what your hardware \emph{could} do and what it \emph{actually} does.

\vspace{0.3cm}
\begin{columns}[T]
\begin{column}{0.48\textwidth}
\textbf{What reduces $\eta$:}
\begin{itemize}\setlength\itemsep{1pt}
\item Kernel launch overhead
\item SM occupancy limits
\item Memory coalescing misses
\item Framework overhead (Python GIL)
\item Communication stalls
\end{itemize}
\end{column}
\begin{column}{0.48\textwidth}
\textbf{Typical values:}

\scriptsize
\begin{tabular}{@{}lr@{}}
\toprule
Scenario & $\eta$ \\
\midrule
Training (Megatron-LM) & 0.40--0.55 \\
Training (PyTorch eager) & 0.08--0.15 \\
Inference decode, bs=1 & 0.01--0.05 \\
Inference decode, bs=32+ & 0.15--0.35 \\
Inference prefill & 0.30--0.50 \\
TinyML (TFLite Micro) & 0.05--0.15 \\
\bottomrule
\end{tabular}
\end{column}
\end{columns}

\vfill
\centering
\small\textcolor{gray}{You do not predict $\eta$ --- you measure it once and use it for what-if analysis.}
\end{frame}

% --- Slide 1.9: The Iron Law (Full) ---
\begin{frame}{The Iron Law of ML Systems}
\note{[3 min] Walk through each denominator term. Point out that every
wall in the 22-wall taxonomy maps to exactly one term.
``Which term do you think is hardest to improve?''
% -- FLEX: [CORE]
}

\[
T = \frac{\text{FLOPs}}{N \;\times\; \text{Peak} \;\times\; \text{MFU}
\;\times\; \eta_{\text{scaling}} \;\times\; \text{Goodput}}
\]

\vspace{0.3cm}
\small
\begin{tabular}{llll}
\toprule
\textbf{Term} & \textbf{Meaning} & \textbf{Improved by} & \textbf{Walls} \\
\midrule
$N$ & Number of devices & Budget & --- \\
Peak & Raw hardware speed & GPU generation & 1 (Compute) \\
MFU & Software efficiency & FlashAttention, fusion & 2--3 \\
$\eta_{\text{scaling}}$ & Communication loss & BW, gradient compression & 14--16 \\
Goodput & Failure overhead & Checkpointing, FT & 15, 19 \\
\bottomrule
\end{tabular}

\vspace{0.3cm}
\centering
\textit{Every wall in the taxonomy attacks one of these five terms.}
\end{frame}
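
% --- Slide 1.9b: Iron Law with numbers (sketch) ---
\begin{frame}[fragile]{Iron Law: Plugging In Numbers}
\note{[1 min] Optional. The term values are assumptions, not measurements.}

\small
A minimal sketch of the Iron Law as plain arithmetic. The FLOP count is
the Llama-3-405B budget from the opening slide; the denominator terms are
assumed, illustrative values:

\begin{lstlisting}
flops   = 3.8e25   # ~ Llama-3-405B pretraining budget
n       = 16384    # H100s
peak    = 989e12   # FP16 peak per device
mfu     = 0.45     # assumed
eta     = 0.95     # assumed scaling efficiency
goodput = 0.95     # assumed useful-work fraction

t = flops / (n * peak * mfu * eta * goodput)
print(f"{t/86400:.0f} days")   # ~67 days: the same ballpark
                               # as the reported 54
\end{lstlisting}
\end{frame}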

% --- Slide 1.10: Arithmetic Intensity Deep Dive ---
\begin{frame}{Arithmetic Intensity: The Dial You Control}
\note{[2 min] ``Batch size is the primary knob. Each additional sample
in the batch reuses the same weights that are already loaded.
The compute grows linearly but memory stays constant.''
% -- FLEX: [CORE]
}

\small
\[
\text{AI} = \frac{\text{FLOPs}}{\text{Bytes}} \approx
\frac{2 \times \text{Params} \times B}{
\underbrace{\text{Params} \times \text{bpp}}_{\text{weights}} +
\underbrace{\text{Activations}(B)}_{\text{grows with } B}}
\]

\vspace{0.2cm}
\textbf{The batch-size knob:}
\begin{itemize}
\item At $B=1$: AI $\approx$ 1 FLOP/byte $\Rightarrow$ \textbf{memory-bound}
\item At $B=32$: AI $\approx$ 32 FLOP/byte $\Rightarrow$ approaching ridge
\item At $B=256$: AI $\gg$ ridge $\Rightarrow$ \textbf{compute-bound}
\end{itemize}

\vspace{0.2cm}
\centering
\begin{tikzpicture}[scale=0.65]
\draw[->, thick] (0,0) -- (7,0) node[right, font=\scriptsize] {Batch size};
\draw[->, thick] (0,0) -- (0,4) node[above, font=\scriptsize] {Throughput};
\draw[very thick, datastroke] (0.3,0.3) -- (3,3);
\draw[very thick, computestroke] (3,3) -- (6.5,3);
\fill[crimson] (3,3) circle (3pt);
\node[above, font=\scriptsize, crimson] at (3,3.1) {Ridge};
\node[below, font=\scriptsize, datastroke] at (1.3,0) {BW-bound};
\node[below, font=\scriptsize, computestroke] at (5,0) {Compute-bound};
\end{tikzpicture}
\end{frame}
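
% --- Slide 1.10b: The AI formula as code (sketch) ---
\begin{frame}[fragile]{AI vs Batch Size: A Two-Line Model}
\note{[1 min] Optional. Weights-only traffic; the solver adds activation
and KV traffic, and achieved FLOPS sit below peak, so the measured
crossover (Exercise 1) lands at a lower batch size.}

\small
The formula above, ignoring the activation term (an assumption that holds
when weights dominate traffic). For FP16 this simplifies to AI $\approx B$:

\begin{lstlisting}
params, bpp = 8e9, 2   # Llama-3 8B, FP16

for B in [1, 32, 256]:
    ai = (2 * params * B) / (params * bpp)   # = B at FP16
    print(f"B={B:>3}: AI ~ {ai:.0f} FLOP/byte")
\end{lstlisting}

\vspace{0.2cm}
The exact crossover depends on achieved (not peak) FLOPS and on
activation/KV traffic --- which is what \texttt{Engine.solve()} accounts for.
\end{frame}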

% --- Slide 1.11: Live Demo --- Batch Size Sweep ---
\begin{frame}[fragile]{Live Demo: Finding the Ridge Point}
\note{[3 min] Run this loop live. Show how the bottleneck flips
from Memory to Compute as batch size increases.
Before running, ask: ``At what batch size do you
predict the bottleneck will flip?'' Take guesses.
% -- FLEX: [CORE]
}

\small
\begin{lstlisting}
llama = mlsysim.Models.Language.Llama3_8B
hw = mlsysim.Hardware.Cloud.H100

for bs in [1, 4, 16, 64, 128, 256]:
    p = mlsysim.Engine.solve(llama, hw, batch_size=bs)
    print(f"bs={bs:>3d} {p.bottleneck:<8s} "
          f"MFU={p.mfu:.3f}")
\end{lstlisting}

\vspace{0.3cm}
\begin{exampleblock}{What to observe}
\begin{itemize}
\item Bottleneck flips from \texttt{Memory} to \texttt{Compute}
\item MFU climbs as batch size increases (better hardware utilization)
\item Latency grows but throughput (tokens/s) improves
\end{itemize}
\end{exampleblock}
\end{frame}

% --- Slide 1.12: Exercise 1 ---
\begin{frame}[fragile]{Exercise 1: Find the Crossover}
\note{[5 min] Attendees work individually.
At what batch size does Llama-3 8B on H100 transition
from memory-bound to compute-bound?
Expected: around bs=32--64 depending on precision.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.
% -- FLEX: [CORE] --- this is a critical hands-on moment.
}

\centering
\Large\bfseries
Hands-On Exercise\\[0.5cm]
\normalsize

\textbf{Question:} At what batch size does Llama-3 8B on H100
transition from memory-bound to compute-bound?

\vspace{0.3cm}
\begin{lstlisting}
for bs in range(1, 129):
    p = mlsysim.Engine.solve(llama, hw, batch_size=bs)
    if p.bottleneck == "Compute":
        print(f"Crossover at batch size {bs}")
        break
\end{lstlisting}

\vspace{0.3cm}
\textit{Bonus: Try the same on A100. Does the crossover happen
at the same batch size? Why or why not?}
\end{frame}

% --- Slide 1.13: The Ridge Point Explained ---
\begin{frame}{The Ridge Point: Hardware DNA}
\note{[2 min] ``The ridge point is a property of the hardware,
not the workload. It tells you how many FLOPs per byte the chip
can sustain before compute becomes the ceiling.''
% -- FLEX: [CORE]
}

\small
\[
\text{Ridge Point} = \frac{\text{Peak FLOPS}}{\text{Peak BW}}
\;\;\bigl[\text{FLOP/byte}\bigr]
\]

\vspace{0.2cm}
\scriptsize
\begin{tabular}{llccc}
\toprule
\textbf{Vendor} & \textbf{Hardware} & \textbf{Peak FP16} & \textbf{HBM BW} & \textbf{Ridge} \\
\midrule
NVIDIA & H100 SXM & 989 TFLOPS & 3.35 TB/s & 295 FLOP/B \\
NVIDIA & B200 & 2.25 PFLOPS & 8.0 TB/s & 281 FLOP/B \\
AMD & MI300X & 1,307 TFLOPS & 5.3 TB/s & 247 FLOP/B \\
Intel & Gaudi\,3 & 1,835 TFLOPS & 3.7 TB/s & 496 FLOP/B \\
\bottomrule
\end{tabular}
\small

\vspace{0.3cm}
\begin{itemize}
\item Higher ridge $\Rightarrow$ more workloads are memory-bound on this chip
\item \alert{FLOPS grow faster than bandwidth across GPU generations}
\item The memory wall is getting \textbf{worse}, not better
\end{itemize}
\end{frame}

% --- Slide 1.14: Predict Before You Peek #2 ---
\begin{frame}{Predict: ResNet-50 vs Llama-3 8B}
\note{[2 min] ``ResNet-50 at batch 256 vs Llama-3 8B at batch 1.
Which is compute-bound and which is memory-bound?''
Give 30 seconds. Expected: ResNet at high batch is compute-bound;
Llama at bs=1 is memory-bound.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.
% -- FLEX: [CORE]
}

\centering
\Large\bfseries
Two workloads on the same H100.\\[0.3cm]
\normalsize
Which is compute-bound? Which is memory-bound?\\[0.5cm]

\pause

\small
\begin{tabular}{lcc}
\toprule
& \textbf{ResNet-50 (bs=256)} & \textbf{Llama-3 8B (bs=1)} \\
\midrule
Total FLOPs & $2.05 \times 10^{12}$ & $1.6 \times 10^{10}$ \\
Weight bytes & 50 MB (FP16) & 16 GB (FP16) \\
AI (FLOP/B) & $\sim$41{,}000 & $\sim$1 \\
\midrule
\textbf{Regime} & \colorbox{computeblue}{Compute-bound} & \colorbox{datagreen}{Memory-bound} \\
\bottomrule
\end{tabular}

\vspace{0.3cm}
\pause
\alert{Same hardware, completely different bottlenecks.}\\
\textit{The workload determines the regime, not the GPU.}
\end{frame}

% --- Slide 1.15: Key Takeaway (Part 1) ---
\begin{frame}{Part 1: Key Takeaway}
\note{[1 min] Summarize in one sentence. Repeat it twice.
``The bottleneck determines the speedup. Know your regime.''
% -- FLEX: [CORE]
}

\centering\Large

\textbf{The bottleneck determines the speedup.}\\[0.5cm]

\normalsize
\begin{itemize}
\item The Roofline model tells you \textit{which} constraint is binding.
\item Batch size is the primary knob that moves you between regimes.
\item More FLOPS only helps if you are compute-bound.
\item More bandwidth only helps if you are memory-bound.
\item \texttt{Engine.solve()} answers this in one line.
\end{itemize}
\end{frame}

% --- Roadmap: After Break ---
\begin{frame}{Roadmap: You Are Here}
\note{[1 min] Quick orientation after break. We now move from single-op analysis to serving.}

\centering\small
\begin{tabular}{rll}
\toprule
\textbf{Time} & \textbf{Part} & \textbf{Status} \\
\midrule
9:00--9:30 & Part 0: Welcome \& Setup & \checkmark \\
9:30--10:30 & Part 1: Iron Law \& Roofline & \checkmark \\
\rowcolor{crimson!12}
10:45--11:45 & \textbf{Part 2: Memory Walls \& Serving} & \textbf{$\leftarrow$ You are here} \\
11:45--12:00 & Part 3: Compression & \\
\bottomrule
\end{tabular}
\end{frame}

% =============================================================================
% PART 2: MEMORY WALLS & SERVING (12 slides)
% =============================================================================
\section{Memory Walls \& Serving}

% --- Slide 2.1: Key Question ---
\begin{frame}{Key Question}
\note{[1 min] ``You have seen that LLM inference at batch 1 is
memory-bound. But serving is more complex than a single forward
pass. What makes LLM serving fundamentally different from
CNN inference?''
% -- FLEX: [CORE]
}

\centering
\Large\bfseries
Why does the first token take 50\,ms\\[0.2cm]
but each next token only takes 5\,ms?
\end{frame}

% --- Slide 2.2: Two Phases of LLM Serving ---
\begin{frame}{Wall 4: Prefill vs Decode --- Two Different Physics}
\note{[3 min] ``Prefill is like reading a book fast (compute-intensive).
Decode is like looking up one word at a time in a dictionary
(memory-intensive). Same model, different bottlenecks.''
% -- FLEX: [CORE]
}

\small
\wallbox{The Serving Wall}{
\begin{tabular}{lcl}
\textbf{TTFT} (Prefill) & $=$ & $\dfrac{\text{Prefill FLOPs}}{\text{Peak FLOPS} \times \text{MFU}}$
\quad\colorbox{computeblue}{Compute-bound} \\[10pt]
\textbf{ITL} (Decode) & $=$ & $\dfrac{\text{Weight Bytes}}{\text{Bandwidth}}$
\quad\colorbox{datagreen}{Memory-bound}
\end{tabular}
}

\vspace{0.3cm}
\begin{columns}[T]
\begin{column}{0.48\textwidth}
\textbf{Prefill} (process the prompt)
\begin{itemize}\setlength\itemsep{2pt}
\item All prompt tokens in parallel
\item $O(S^2)$ attention + $O(S \cdot P)$ linear
\item Compute-bound (high AI)
\item Determines \textbf{TTFT}
\end{itemize}
\end{column}
\begin{column}{0.48\textwidth}
\textbf{Decode} (generate tokens)
\begin{itemize}\setlength\itemsep{2pt}
\item One token at a time
\item Must reload all weights per token
\item Memory-bound (AI $\approx$ 1)
\item Determines \textbf{ITL}
\end{itemize}
\end{column}
\end{columns}
\end{frame}
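
% --- Slide 2.2b: TTFT and ITL by hand (sketch) ---
\begin{frame}[fragile]{TTFT and ITL by Hand}
\note{[1 min] Optional. Linear-layer FLOPs only; flat assumed MFU.
Treat TTFT as scaling intuition, not a prediction.}

\small
The two formulas above as plain Python. The sketch counts only the
$O(S \cdot P)$ linear-layer FLOPs and assumes a flat MFU, so the TTFT
numbers show the scaling in $S$ rather than predict the solver's output:

\begin{lstlisting}
params, bpp = 8e9, 2            # Llama-3 8B, FP16
peak, bw    = 989e12, 3.35e12   # H100
mfu         = 0.4               # assumed prefill MFU

itl = params * bpp / bw         # reload all weights per token
print(f"ITL ~ {itl*1e3:.1f} ms")            # ~4.8 ms

for S in [512, 2048, 4096]:
    ttft = 2 * params * S / (peak * mfu)    # linear in S
    print(f"S={S:>4}: TTFT ~ {ttft*1e3:.0f} ms")
\end{lstlisting}
\end{frame}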

% --- Slide 2.3: KV Cache --- The Hidden Consumer ---
\begin{frame}{Wall 5: The KV Cache --- Hidden Memory Consumer}
\note{[3 min] ``Each active request carries its own memory of the
conversation.'' Quick math: how much KV cache does one Llama-3 8B
request at 4K context need in FP16?
Expected: 2 * 32 * 32 * 128 * 4096 * 2 bytes = ~2 GB.
% -- FLEX: [CORE]
}

\small
\wallbox{The Batching Wall}{
\[
\text{KV cache} = 2 \times L \times H \times d \times S \times B \times \text{bpp}
\]
}

\vspace{0.2cm}
\textbf{Llama-3 8B at 4K context, FP16:}
\begin{itemize}
\item $2 \times 32 \times 32 \times 128 \times 4096 \times 2\;\text{bytes} \approx 2\;\text{GB}$ per request
\item H100 has 80 GB HBM --- model weights take 16 GB
\item Remaining 64 GB $\div$ 2 GB/request $=$ \textbf{$\sim$32 concurrent requests}
\end{itemize}

\vspace{0.2cm}
\begin{alertblock}{The serving paradox}
You want high batch size (for throughput) but KV cache limits how
many requests fit in memory. \alert{Memory capacity, not compute, limits concurrency.}
\end{alertblock}
\end{frame}
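
% --- Slide 2.3b: KV cache calculator (sketch) ---
\begin{frame}[fragile]{KV Cache Calculator}
\note{[1 min] Optional. Same formula, same dims, as plain Python.}

\small
The formula above as a few lines of plain Python, with the slide's
Llama-3 8B dimensions ($L$ layers, $H$ heads, $d$ head dim):

\begin{lstlisting}
L, H, d = 32, 32, 128    # layers, heads, head dim
S, B    = 4096, 1        # context length, batch size
bpp     = 2              # FP16

kv = 2 * L * H * d * S * B * bpp       # K and V tensors
print(f"{kv/1e9:.1f} GB per request")  # ~2.1 GB

hbm, weights = 80e9, 16e9
print(f"~{(hbm - weights) / kv:.0f} concurrent requests")  # ~30
\end{lstlisting}
\end{frame}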

% --- Slide 2.4: Live Demo --- ServingModel ---
\begin{frame}[fragile]{Live Demo: Two-Phase Serving Analysis}
\note{[2 min] Run ServingModel live. Point out TTFT vs ITL in output.
Show that TTFT is compute-bound and ITL is memory-bound.
% -- FLEX: [CORE]
}

\small
\begin{lstlisting}
serving = mlsysim.ServingModel()
result = serving.solve(llama, hw,
                       seq_len=4096, batch_size=1)
print(f"TTFT: {result.ttft:~P}")
print(f"ITL: {result.itl:~P}")
print(f"KV cache/req: {result.kv_cache_per_request:~P}")
\end{lstlisting}

\vspace{0.2cm}
\begin{exampleblock}{Expected output}
TTFT $\approx$ 20--50 ms (compute-bound),
ITL $\approx$ 5 ms (memory-bound),
KV cache per request $\approx$ 2 GB
\end{exampleblock}
\end{frame}

% --- Slide 2.5: Continuous Batching ---
\begin{frame}{Continuous Batching: Don't Wait, Serve}
\note{[2 min] ``In static batching, the GPU waits for the longest
request to finish. In continuous batching, new requests start
as soon as any slot frees up. Throughput can improve 2--5x.''
% -- FLEX: [OPTIONAL] Can summarize quickly if behind schedule.
}

\small
\begin{columns}[T]
\begin{column}{0.48\textwidth}
\textbf{Static batching}
\begin{itemize}\setlength\itemsep{2pt}
\item Pad all sequences to max length
\item GPU idle while short requests finish
\item Throughput limited by longest request
\item Simple to implement
\end{itemize}
\end{column}
\begin{column}{0.48\textwidth}
\textbf{Continuous batching}
\begin{itemize}\setlength\itemsep{2pt}
\item Insert new requests per iteration
\item No padding waste
\item Throughput 2--5$\times$ higher
\item Used by vLLM, TGI, TensorRT-LLM
\end{itemize}
\end{column}
\end{columns}

\vspace{0.3cm}
\centering
\begin{tikzpicture}[scale=0.65, >=Stealth]
% Static
\node[font=\scriptsize\bfseries] at (0, 2.5) {Static};
\foreach \i/\len in {0/4, 1/2, 2/3} {
\fill[computeblue, draw=computestroke] (0.5, \i*0.7) rectangle ({0.5 + \len*0.5}, \i*0.7+0.5);
\fill[errorfill, draw=errorstroke, opacity=0.5] ({0.5 + \len*0.5}, \i*0.7) rectangle (2.5, \i*0.7+0.5);
}
\node[font=\tiny, errorstroke] at (2.8, 0.7) {waste};

% Continuous
\node[font=\scriptsize\bfseries] at (5, 2.5) {Continuous};
\fill[computeblue, draw=computestroke] (5.5, 1.4) rectangle (7.5, 1.9);
\fill[datagreen, draw=datastroke] (5.5, 0.7) rectangle (6.5, 1.2);
\fill[routingorange, draw=routingstroke] (6.7, 0.7) rectangle (7.5, 1.2);
\fill[computeblue, draw=computestroke] (5.5, 0) rectangle (7, 0.5);
\fill[datagreen, draw=datastroke] (7.2, 0) rectangle (7.5, 0.5);
\node[font=\tiny, datastroke] at (8, 0.7) {no waste};
\end{tikzpicture}
\end{frame}

% --- Slide 2.6: PagedAttention ---
\begin{frame}[fragile]{PagedAttention: Virtual Memory for KV Cache}
\note{[2 min] ``Just like OS virtual memory pages physical RAM,
PagedAttention pages the KV cache. Non-contiguous blocks mean
no fragmentation, so you can fit more requests.''
% -- FLEX: [OPTIONAL]
}

\small
\textbf{The problem:} Pre-allocated KV cache wastes memory on short sequences.

\textbf{The solution} (Kwon et al., 2023 --- vLLM):
\begin{itemize}
\item Divide KV cache into fixed-size \textbf{pages} (e.g., 16 tokens each)
\item Allocate pages on demand, not up front
\item Non-contiguous storage eliminates fragmentation
\item Memory utilization improves from $\sim$50\% to $>$95\%
\end{itemize}

\vspace{0.2cm}
\begin{exampleblock}{Impact}
With the same 80 GB H100, PagedAttention can serve
\textbf{2--4$\times$ more concurrent requests} than static allocation.
\end{exampleblock}

\vspace{0.2cm}
\begin{lstlisting}
# mlsysim models this with ContinuousBatchingModel
cb = mlsysim.ContinuousBatchingModel()
result = cb.solve(model, hw, seq_len=4096,
                  max_batch_size=32, page_size=16)
print(f"Max concurrent: {result.max_concurrent_requests}")
\end{lstlisting}
\end{frame}

% --- Slide 2.7: Predict Before You Peek #3 ---
\begin{frame}{Predict: How Many Concurrent Requests?}
\note{[2 min] Predict-before-reveal. Give 60 seconds.
H100 80 GB, Llama-3 8B FP16, 4K context. How many concurrent requests?
Expected: weights = 16 GB, remaining = 64 GB.
PagedAttention (95\% util): ~30. Without (50\% util): ~16.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.
% -- FLEX: [CORE]
}

\centering
\Large\bfseries
H100 (80 GB), Llama-3 8B (FP16), 4K context.\\[0.5cm]
\normalsize
How many concurrent requests can you serve?\\[0.3cm]

\pause

\small
\begin{tabular}{lcc}
\toprule
& \textbf{Static alloc.} & \textbf{PagedAttention} \\
\midrule
KV cache utilization & $\sim$50\% & $\sim$95\% \\
Effective memory/req & $\sim$4 GB & $\sim$2.1 GB \\
Max concurrent & $\sim$16 & $\sim$30 \\
\bottomrule
\end{tabular}

\vspace{0.3cm}
\pause
\alert{PagedAttention nearly doubles serving capacity without changing hardware.}
\end{frame}

% --- Slide 2.8: Speculative Decoding ---
\begin{frame}[fragile]{Speculative Decoding: Betting on the Draft}
\note{[2 min] ``Use a small draft model to guess the next K tokens.
Then verify all K in a single forward pass of the big model.
If the draft is right 70\% of the time and K=5, you effectively
decode ~3.5 tokens per forward pass instead of 1.''
% -- FLEX: [OPTIONAL]
}

\small
\textbf{Insight:} Decode is memory-bound $\Rightarrow$ the GPU has spare compute.

\textbf{Speculative decoding} (Leviathan et al., 2023):
\begin{enumerate}\setlength\itemsep{2pt}
\item \textbf{Draft} $K$ tokens with a small model (fast, low quality)
\item \textbf{Verify} all $K$ tokens in one forward pass of the big model
\item \textbf{Accept} the longest prefix that matches
\item Speedup $\approx K \times \alpha$ where $\alpha$ is acceptance rate
\end{enumerate}

\vspace{0.2cm}
\begin{lstlisting}
# mlsysim supports speculative decoding
draft = mlsysim.Models.Language.Llama3_8B  # placeholder; use a
                                           # smaller model in practice
result = serving.solve(model, hw, seq_len=4096,
                       draft_model=draft, draft_acceptance_rate=0.7)
print(f"Speedup: {result.speculative_speedup:.2f}x")
\end{lstlisting}
\end{frame}

% --- Slide 2.9: Disaggregated Serving ---
\begin{frame}[fragile]{Disaggregated Serving: Right Hardware for Each Phase}
\note{[2 min] ``Split prefill and decode onto different node types.
Prefill nodes optimize for FLOPS, decode nodes optimize for
bandwidth. Transfer KV cache over the network between them.''
% -- FLEX: [OPTIONAL]
}

\small
\textbf{Key insight:} Prefill and decode have \textit{opposite} hardware preferences.

\begin{columns}[T]
\begin{column}{0.48\textwidth}
\textbf{Prefill node}
\begin{itemize}\setlength\itemsep{2pt}
\item Needs high FLOPS
\item Moderate memory
\item E.g., H100 at high utilization
\end{itemize}
\end{column}
\begin{column}{0.48\textwidth}
\textbf{Decode node}
\begin{itemize}\setlength\itemsep{2pt}
\item Needs high BW
\item Large memory (for KV cache)
\item E.g., many smaller accelerators
\end{itemize}
\end{column}
\end{columns}

\vspace{0.3cm}
\begin{lstlisting}
# Disaggregated serving in mlsysim
result = serving.solve(model, hw, seq_len=4096,
                       decode_hardware=mlsysim.Hardware.Cloud.A100)
print(f"TTFT: {result.ttft:~P} ITL: {result.itl:~P}")
\end{lstlisting}

\vspace{0.2cm}
\alert{Trade-off: network transfer of KV cache adds latency between phases.}
\end{frame}

% --- Slide 2.10: Exercise 2 ---
\begin{frame}[fragile]{Exercise 2: Serving Capacity Planning}
\note{[5 min] Attendees work in pairs.
How many concurrent Llama-3 8B requests (4K context) can an H100
serve while maintaining ITL < 10ms?
Expected: ~30 with PagedAttention at FP16. Binding: memory capacity.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.
% -- FLEX: [CORE]
}

\centering
\Large\bfseries
Hands-On Exercise\\[0.5cm]
\normalsize

\textbf{Question:} You run Llama-3 8B (FP16) on one H100 with 4K context.\\
Your SLA requires ITL $<$ 10 ms.\\
How many concurrent requests can you serve?

\vspace{0.3cm}
\begin{lstlisting}
cb = mlsysim.ContinuousBatchingModel()
result = cb.solve(llama, hw, seq_len=4096,
                  max_batch_size=64, page_size=16)
print(f"Max concurrent: {result.max_concurrent_requests}")
\end{lstlisting}

\vspace{0.2cm}
\textit{Bonus: What happens if you switch to INT8 precision?
Does concurrency double?}
\end{frame}

% --- Slide 2.11: Fallacies ---
\begin{frame}{Fallacies: Serving Edition}
\note{[2 min] Walk through each fallacy with the quantitative
counter-evidence.
% -- FLEX: [OPTIONAL] --- can trim to 2 fallacies if behind.
}

\small
\textbf{Fallacy:} \textit{``Faster GPUs always reduce latency.''}\\
A 3.2$\times$ FLOPS improvement (A100 $\to$ H100) yields only
1.7$\times$ ITL improvement because decode is memory-bound.

\vspace{0.3cm}
\textbf{Fallacy:} \textit{``Doubling memory doubles serving capacity.''}\\
Weights are fixed overhead. Going from 80 GB to 160 GB adds 80 GB,
but KV cache per request stays $\sim$2 GB. Capacity goes from $\sim$32
to $\sim$72 ($\sim$2.3$\times$), not 2$\times$.

\vspace{0.3cm}
\textbf{Fallacy:} \textit{``Batch size 1 is fine for LLM inference.''}\\
At bs=1, MFU $<$ 5\%. You are paying for 989 TFLOPS and using $<$ 50.
Continuous batching can recover 10--20$\times$ throughput.
\end{frame}
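
% --- Slide 2.11b: Checking fallacy #2 (sketch) ---
\begin{frame}[fragile]{Checking the Capacity Fallacy}
\note{[30 sec] Optional. Two lines of arithmetic.}

\small
The second fallacy as arithmetic: weights are fixed overhead, so capacity
grows faster than memory does.

\begin{lstlisting}
weights, kv_per_req = 16, 2    # GB
for hbm in [80, 160]:
    reqs = (hbm - weights) // kv_per_req
    print(f"{hbm} GB -> {reqs} requests")
# 80 GB -> 32, 160 GB -> 72: 2x memory gives 2.25x capacity
\end{lstlisting}
\end{frame}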

% --- Slide 2.12: Part 2 Key Takeaway ---
\begin{frame}{Part 2: Key Takeaway}
\note{[1 min] One sentence summary, repeat twice.
% -- FLEX: [CORE]
}

\centering\Large

\textbf{LLM serving has two phases\\with opposite bottlenecks.}\\[0.5cm]

\normalsize
\begin{itemize}
\item \textbf{Prefill} (TTFT) is compute-bound --- optimize with parallelism.
\item \textbf{Decode} (ITL) is memory-bound --- optimize with bandwidth.
\item \textbf{KV cache} limits concurrency --- optimize with PagedAttention.
\item \texttt{ServingModel.solve()} decomposes both phases in one call.
\end{itemize}
\end{frame}

% =============================================================================
% PART 3: COMPRESSION & EFFICIENCY (8 slides)
% =============================================================================
\section{Compression \& Efficiency}

% --- Slide 3.1: Key Question ---
\begin{frame}{Key Question}
\note{[1 min] ``If we can shrink the model, we move less data,
and the memory wall recedes. But there is a catch.''
% -- FLEX: [CORE]
}

\centering
\Large\bfseries
Can you make the model 4$\times$ smaller\\[0.2cm]
and get a 4$\times$ speedup?
\end{frame}

% --- Slide 3.2: Wall 13 --- Compression ---
\begin{frame}{Wall 13: The Fidelity Wall}
\note{[2 min] ``Storage always shrinks. But inference speedup
depends on the method. This distinction trips up everyone.''
% -- FLEX: [CORE]
}

\small
\wallbox{The Fidelity Wall}{
\[
\text{Compression}_{\text{quant}} = \frac{32}{\text{bits}},
\qquad
\text{Compression}_{\text{prune}} = \frac{1}{1 - \text{sparsity}}
\]
}

\vspace{0.3cm}
\begin{tabular}{lccc}
\toprule
\textbf{Method} & \textbf{Storage} & \textbf{Speedup} & \textbf{Accuracy} \\
\midrule
FP32 $\to$ FP16 & 2$\times$ & 2$\times$ & $\sim$0\% loss \\
FP16 $\to$ INT8 & 2$\times$ & 1.5--2$\times$ & $<$1\% loss \\
FP16 $\to$ INT4 & 4$\times$ & 2--3$\times$ & 2--5\% loss \\
50\% unstructured prune & 2$\times$ & \alert{1$\times$ (no speedup!)} & 1--3\% loss \\
50\% structured prune & 2$\times$ & $\sim$2$\times$ & 2--5\% loss \\
2:4 N:M sparsity & 2$\times$ & 2$\times$ & 1--2\% loss \\
\bottomrule
\end{tabular}

\vspace{0.2cm}
\alert{Unstructured pruning saves storage but gives zero GPU speedup.\\
Only structured patterns accelerate hardware execution.}
\end{frame}

% --- Slide 3.3: Quantization Deep Dive ---
\begin{frame}{Quantization: Trading Bits for Speed}
\note{[2 min] Walk through the precision ladder.
``If you quantize Llama-3 8B from FP16 to INT4, how much memory?''
Expected: 8B * 0.5 bytes = 4 GB. Down from 16 GB.
% -- FLEX: [CORE]
}

\small
\begin{columns}[T]
\begin{column}{0.55\textwidth}
\textbf{The precision ladder:}

\vspace{0.2cm}
\begin{tikzpicture}[scale=0.8, >=Stealth]
\foreach \i/\label/\bits/\color in {
0/FP32/32/errorfill,
1/FP16\slash BF16/16/routingorange,
2/INT8\slash FP8/8/computeblue,
3/INT4/4/datagreen} {
\fill[\color, draw=midgray] (0, 3-\i) rectangle ({0.15*\bits}, 3.6-\i);
\node[right, font=\footnotesize] at ({0.15*\bits + 0.2}, 3.3-\i) {\label};
}
\draw[->, thick] (0, -0.5) -- (5.5, -0.5) node[right, font=\scriptsize] {Size};
\node[font=\scriptsize, midgray] at (2.5, -1) {$\longleftarrow$ smaller is better};
\end{tikzpicture}
\end{column}
\begin{column}{0.42\textwidth}
\textbf{Llama-3 8B memory:}
\scriptsize
\begin{tabular}{lc}
\toprule
Precision & Weight Size \\
\midrule
FP32 & 32 GB \\
FP16 & 16 GB \\
INT8 & 8 GB \\
INT4 & 4 GB \\
\bottomrule
\end{tabular}

\vspace{0.3cm}
\normalsize
At INT4, Llama-3 8B fits in\\
a \textbf{laptop GPU} (6 GB).
\end{column}
\end{columns}
\end{frame}
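
% --- Slide 3.3b: The ladder in code (sketch) ---
\begin{frame}[fragile]{The Ladder in Two Lines}
\note{[30 sec] Optional. Weight size is just params times bits/8.}

\small
The weight-size column, reproduced in plain Python (bits/8 bytes per parameter):

\begin{lstlisting}
params = 8e9   # Llama-3 8B
for bits in [32, 16, 8, 4]:
    print(f"{bits:>2}-bit: {params * bits / 8 / 1e9:.0f} GB")
\end{lstlisting}
\end{frame}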

% --- Slide 3.4: Live Demo --- CompressionModel ---
\begin{frame}[fragile]{Live Demo: Quantization Impact}
\note{[2 min] Run CompressionModel live. Show storage savings
and speedup side by side.
% -- FLEX: [CORE]
}

\small
\begin{lstlisting}
comp = mlsysim.CompressionModel()
for bits in [16, 8, 4]:
    r = comp.solve(llama, hw, method="quantization",
                   target_bitwidth=bits)
    print(f"INT{bits}: size={r.compressed_size:~P} "
          f"speedup={r.inference_speedup:.1f}x")
\end{lstlisting}

\vspace{0.2cm}
\begin{exampleblock}{What to observe}
\begin{itemize}
\item Storage shrinks linearly with bit reduction
\item Speedup follows storage for quantization (structured by nature)
\item Accuracy degrades modestly at INT8, more at INT4
\end{itemize}
\end{exampleblock}
\end{frame}

% --- Slide 3.5: Structured vs Unstructured Pruning ---
\begin{frame}[fragile]{Pruning: The Structure Matters}
\note{[2 min] ``Unstructured pruning zeros out individual weights.
The matrix is still the same shape, so the GPU does the same
number of operations. No speedup! Structured pruning removes
entire rows/columns, physically shrinking the matrix.''
WARN: Students assume any compression = speedup.
% -- FLEX: [CORE]
}

\small
\begin{columns}[T]
\begin{column}{0.48\textwidth}
\textbf{Unstructured}
\begin{itemize}\setlength\itemsep{2pt}
\item Zero out individual weights
\item Matrix shape unchanged
\item GPU does the same work (dense kernels cannot skip scattered zeros)
\item \alert{Storage savings only}
\end{itemize}
\end{column}
\begin{column}{0.48\textwidth}
\textbf{Structured / N:M}
\begin{itemize}\setlength\itemsep{2pt}
\item Remove entire rows/columns, or\\
2:4 pattern (Ampere+)
\item Physically smaller matrices
\item GPU hardware support (2:4 $\to$ 2$\times$)
\item \textbf{Real speedup}
\end{itemize}
\end{column}
\end{columns}

\vspace{0.3cm}
\begin{lstlisting}
for stype in ["unstructured", "structured", "n_m"]:
    r = comp.solve(llama, hw, method="pruning",
                   sparsity=0.5, sparsity_type=stype)
    print(f"{stype:>14}: {r.inference_speedup:.1f}x")
\end{lstlisting}
\end{frame}

% --- Slide 3.5b: Predict --- What Does INT4 Change? ---
\begin{frame}{Predict: What Does INT4 Change?}
\note{[1 min] Quick poll. Most say ``latency gets better.'' The real answer is fleet size.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.}

\centering
\Large
\textbf{You quantize Llama-3 70B from FP16 to INT4.}\\[0.5cm]
\normalsize
What is the BIGGEST impact on your serving infrastructure?\\[0.3cm]
\begin{enumerate}[(A)]
\item Inference latency drops by 4$\times$
\item Model quality degrades significantly
\item \textbf{You need a fraction of the GPUs (4 $\to$ 1 per replica)}
\item Memory bandwidth becomes the bottleneck
\end{enumerate}
\end{frame}

% --- Slide 3.6: Predict Before You Peek #4 ---
\begin{frame}{Predict: INT4 Llama-3 8B on H100}
\note{[2 min] Predict-before-reveal. If you quantize to INT4 at bs=1,
is it still memory-bound? Yes! Load time = 4/3350 = 1.2 ms.
Compute still ~0.03 ms. Still memory-bound but 4x faster.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.
% -- FLEX: [CORE]
}

\centering
\Large\bfseries
Quantize Llama-3 8B to INT4.\\
Run inference at batch size 1 on H100.\\[0.5cm]
\normalsize
Is it still memory-bound?\\[0.3cm]

\pause

\small
\begin{tabular}{lcc}
\toprule
& \textbf{FP16} & \textbf{INT4} \\
\midrule
Weight size & 16 GB & 4 GB \\
Load time & 4.8 ms & 1.2 ms \\
Compute time & 0.03 ms & 0.03 ms \\
\midrule
Bottleneck & Memory & \textbf{Still Memory!} \\
Decode speedup & --- & \textbf{4$\times$} \\
\bottomrule
\end{tabular}

\vspace{0.3cm}
\pause
\alert{INT4 gives 4$\times$ faster decode, but the GPU is still memory-bound.\\
The memory wall is that deep.}
\end{frame}

% --- Slide 3.7: Exercise 3 ---
\begin{frame}[fragile]{Exercise 3: Compression Tradeoffs}
\note{[5 min] Compare INT8 quantization vs 50\% structured pruning
for Llama-3 8B on H100. Which gives better speedup per accuracy loss?
Expected: INT8 wins (~2x speedup, <1\% loss vs 2-5\% loss).
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.
% -- FLEX: [CORE]
}

\centering
\Large\bfseries
Hands-On Exercise\\[0.5cm]
\normalsize

\textbf{Question:} For Llama-3 8B on H100, which is the better deal?
\begin{enumerate}
\item INT8 quantization
\item 50\% structured pruning
\end{enumerate}
Compare speedup per accuracy point lost.

\vspace{0.3cm}
\begin{lstlisting}
r1 = comp.solve(llama, hw, method="quantization",
                target_bitwidth=8)
r2 = comp.solve(llama, hw, method="pruning",
                sparsity=0.5, sparsity_type="structured")
print(f"Quant: {r1.inference_speedup:.1f}x / "
      f"{abs(r1.accuracy_delta):.1%} loss")
print(f"Prune: {r2.inference_speedup:.1f}x / "
      f"{abs(r2.accuracy_delta):.1%} loss")
\end{lstlisting}
\end{frame}

% --- Slide 3.7b: Compression Changes Fleet Architecture ---
\begin{frame}{Compression Changes Fleet Architecture}
\note{[3 min] This is the ``aha'' that compression is architecture, not optimization.
The punchline: INT4 cuts your GPU count AND your electricity bill by 4x.}

\small
\textbf{Llama-3 70B Serving Fleet:}

\vspace{0.3cm}
\begin{tabular}{@{}lrrr@{}}
\toprule
Precision & Model Size & GPUs Needed & Annual Cost \\
\midrule
FP16 & 140 GB & 4 (TP=4) & \$480K \\
INT8 & 70 GB & 2 (TP=2) & \$240K \\
INT4 & 35 GB & 1 & \$120K \\
\bottomrule
\end{tabular}

\vspace{0.3cm}
\textbf{INT4 doesn't just improve latency --- it eliminates 3 GPUs per replica.}\\
At 100 replicas for 1000 QPS: that's \textbf{300 fewer GPUs} and \textbf{\$36M saved per year}.

\vfill
\centering
\small\textcolor{gray}{This is why quantization is a Day 1 architectural decision, not a Day 100 optimization.}
\end{frame}
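
% --- Slide 3.7c: Fleet math in code (sketch) ---
\begin{frame}[fragile]{Fleet Math in Code}
\note{[1 min] Optional. The 50\% usable-HBM fraction and the
\$120K/GPU/year figure are assumptions implied by the table.}

\small
A sketch of the table's arithmetic. Two assumed inputs: half of each
GPU's HBM is reserved for KV cache and activations, and a fully loaded
annual cost of \$120K per GPU (both implied by the table above):

\begin{lstlisting}
import math

usable_gb = 80 * 0.5   # assumed: half of HBM left for weights
cost      = 120_000    # assumed: $/GPU/year, fully loaded

for name, size_gb in [("FP16", 140), ("INT8", 70), ("INT4", 35)]:
    gpus = math.ceil(size_gb / usable_gb)
    print(f"{name}: {gpus} GPU(s), ${gpus*cost:,}/yr")
\end{lstlisting}
\end{frame}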

% --- Slide 3.8: Part 3 Key Takeaway ---
\begin{frame}{Part 3: Key Takeaway}
\note{[1 min] One sentence. Repeat.
% -- FLEX: [CORE]
}

\centering\Large

\textbf{Storage savings $\neq$ inference speedup.}\\[0.5cm]

\normalsize
\begin{itemize}
\item Quantization gives both storage and speed gains.
\item Unstructured pruning gives storage only --- zero GPU speedup.
\item N:M sparsity (2:4) is the hardware-friendly middle ground.
\item Even at INT4, LLM decode is \textit{still} memory-bound.
\item \texttt{CompressionModel.solve()} quantifies the full tradeoff.
\end{itemize}
\end{frame}

% --- Roadmap: After Lunch ---
|
||
\begin{frame}{Roadmap: Afternoon Session}
|
||
\note{[1 min] Re-energize the room. ``Welcome back. The morning was about
|
||
single-node physics. The afternoon is about fleets, money, and carbon.''}
|
||
|
||
\centering\small
|
||
\begin{tabular}{rll}
|
||
\toprule
|
||
\textbf{Time} & \textbf{Part} & \textbf{Status} \\
|
||
\midrule
|
||
9:00--12:00 & Parts 0--3: Single Node & \checkmark Done \\
|
||
\midrule
|
||
\rowcolor{crimson!12}
|
||
1:00--2:15 & \textbf{Part 4: Going Distributed} & \textbf{$\leftarrow$ You are here} \\
|
||
2:30--3:15 & Part 5: Economics \& Sustainability & \\
|
||
3:15--3:45 & Part 6: Design Space Exploration & \\
|
||
3:45--4:15 & Part 7: TinyML to Frontier & \\
|
||
4:15--4:45 & Part 8: Advanced Topics & \\
|
||
4:45--5:00 & Part 9: Wrap-Up \& Capstone & \\
|
||
\bottomrule
|
||
\end{tabular}
|
||
\end{frame}

% =============================================================================
% PART 4: GOING DISTRIBUTED (15 slides)
% =============================================================================
\section{Going Distributed}

% --- Slide 4.1: Key Question ---
\begin{frame}{Key Question}
\note{[1 min] ``Your model does not fit on one GPU. Or it fits but
training would take a year. Either way, you need more GPUs.
But adding GPUs is not free.''
% -- FLEX: [CORE]
}

\centering
\Large\bfseries
If 1 GPU takes 30 days,\\[0.2cm]
do 1000 GPUs take 43 minutes?
\end{frame}

% --- Slide 4.2: Why Distribute? ---
\begin{frame}{Why Distribute?}
\note{[2 min] ``Two reasons to go distributed: (1) the model does not
fit in one GPU's memory, or (2) you want to finish sooner.
Reason 1 is a hard constraint. Reason 2 is an optimization.''
% -- FLEX: [CORE]
}

\small
\textbf{Reason 1: Model does not fit}
\begin{itemize}
\item Llama-3 70B FP16 $=$ 140 GB $>$ H100's 80 GB
\item \alert{Must} split across at least 2 GPUs
\end{itemize}

\vspace{0.3cm}
\textbf{Reason 2: Time-to-train}
\begin{itemize}
\item 1 H100 training Llama-3 70B $\approx$ 15 GPU-years
\item 1024 H100s $\approx$ 5 days (if scaling were perfect)
\item But scaling is \textit{never} perfect...
\end{itemize}

\vspace{0.3cm}
\centering
\begin{tikzpicture}[scale=0.7, >=Stealth]
\draw[->, thick] (0,0) -- (7,0) node[right, font=\scriptsize] {GPUs};
\draw[->, thick] (0,0) -- (0,4) node[above, font=\scriptsize] {Speedup};
\draw[dashed, midgray] (0,0) -- (6.5,3.9) node[right, font=\scriptsize\itshape] {ideal};
\draw[very thick, crimson] (0,0) .. controls (2,2) and (4,3) .. (6.5,3.2);
\node[font=\scriptsize, crimson] at (5.5, 2.4) {reality};
\end{tikzpicture}
\end{frame}
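
% --- Sketch: Ideal Time-to-Train ---
\begin{frame}[fragile]{Sketch: Ideal Time-to-Train}
\note{[1 min] Optional. Pure arithmetic behind the previous slide,
using its 15 GPU-year estimate. Ideal scaling only --- no comm, no
bubbles, no failures yet.
% -- FLEX: [OPTIONAL]
}

\small
The arithmetic behind ``5 days if scaling were perfect,'' using the
previous slide's 15 GPU-year estimate (ideal scaling only):

\begin{lstlisting}
GPU_YEARS = 15   # the slide's estimate for Llama-3 70B on 1 H100

for n in [1, 8, 64, 1024]:
    days = GPU_YEARS * 365 / n
    print(f"{n:5d} GPUs: {days:8.1f} days (ideal)")
# 1024 GPUs -> ~5.3 days; reality is worse (see Wall 14)
\end{lstlisting}
\end{frame}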

% --- Slide 4.3: The Three Dimensions of Parallelism ---
\begin{frame}{3D Parallelism: DP $\times$ TP $\times$ PP}
\note{[3 min] ``Every distributed strategy is a combination of three
dimensions.'' WARN: Students often confuse TP and PP.
TP splits within a layer; PP splits between layers.
% -- FLEX: [CORE]
}

\small
\begin{columns}[T]
\begin{column}{0.32\textwidth}
\textbf{Data Parallel (DP)}
\begin{itemize}\setlength\itemsep{1pt}\footnotesize
\item Replicate full model
\item Split data across replicas
\item AllReduce gradients
\item \textit{Most common}
\end{itemize}
\end{column}
\begin{column}{0.32\textwidth}
\textbf{Tensor Parallel (TP)}
\begin{itemize}\setlength\itemsep{1pt}\footnotesize
\item Split each layer's weights
\item Split activations, not data
\item AllReduce per layer (2$\times$!)
\item Needs fast interconnect
\end{itemize}
\end{column}
\begin{column}{0.32\textwidth}
\textbf{Pipeline Parallel (PP)}
\begin{itemize}\setlength\itemsep{1pt}\footnotesize
\item Split model into stages
\item Each GPU owns $L/\text{PP}$ layers
\item Pipeline bubbles
\item Needs less bandwidth
\end{itemize}
\end{column}
\end{columns}

\vspace{0.3cm}
Total GPUs: $N = \text{DP} \times \text{TP} \times \text{PP}$

\vspace{0.2cm}
\centering
\scriptsize
\begin{tabular}{lccc}
\toprule
\textbf{Property} & \textbf{DP} & \textbf{TP} & \textbf{PP} \\
\midrule
Splits & Data & Weights + Activations & Layers \\
Communication & AllReduce (gradients) & AllReduce (activations) & Point-to-point \\
BW requirement & Moderate & Very high (NVLink) & Low \\
Bubble overhead & None & None & $\sim(P{-}1)/(M{+}P{-}1)$ \\
\bottomrule
\end{tabular}
\end{frame}

% --- Slide 4.3b: AllReduce Concrete Example ---
\begin{frame}[fragile]{AllReduce: A Concrete Example}
\note{[2 min] NUMBERS FIRST, then formula. Students need to see the magnitude before the algebra.}

\small
\textbf{Setup:} 8 H100 GPUs, NVLink at 900 GB/s, Llama-3 8B (16 GB gradients)

\vspace{0.3cm}
\begin{enumerate}
\item Each GPU computes its local gradient: \textbf{16 GB}
\item All 8 GPUs must end up with the \textbf{same averaged gradient}
\item Ring AllReduce passes chunks around the ring\ldots
\end{enumerate}

\vspace{0.3cm}
\begin{lstlisting}
t = mlsysim.core.formulas.calc_ring_allreduce_time(
    message_bytes=16e9,
    n_gpus=8,
    bandwidth_bytes_s=900e9,
    latency_s=500e-9,
)
print(f"AllReduce time: {t.to('ms'):.1f}")
# -> ~31 ms (bandwidth-dominated, latency is negligible)
\end{lstlisting}

\vfill
\centering
\textbf{31 ms} to synchronize 8 GPUs. Now: what happens at 256 GPUs?
\end{frame}

% --- Slide 4.4: Data Parallelism + AllReduce ---
\begin{frame}{Wall 14: The Communication Wall (AllReduce)}
\note{[3 min] ``Quick: 1 GB of gradients, 8 GPUs, 50 GB/s interconnect.
How long for AllReduce?'' Expected: 2*(7/8)*1/50 = 35 ms.
Ring AllReduce sends 2(N-1)/N times the data. As N grows,
this approaches 2x.
% -- FLEX: [CORE]
}

\small
\wallbox{The Communication Wall}{
\[
T_{\text{AllReduce}} = \frac{2(N-1)}{N} \times \frac{M}{BW}
+ 2(N-1) \times \text{latency}
\]
}

\vspace{0.2cm}
\textbf{Example:} 1 GB gradients, 8$\times$ H100 on NVLink (900 GB/s)

\[
T = \frac{2 \times 7}{8} \times \frac{1}{900} \approx 1.9\;\text{ms}
\]

\vspace{0.2cm}
\textbf{Same gradients}, 256$\times$ H100 across InfiniBand (50 GB/s):

\[
T = \frac{2 \times 255}{256} \times \frac{1}{50} \approx 40\;\text{ms}
\]

\vspace{0.2cm}
\alert{NVLink's 18$\times$ bandwidth advantage makes this AllReduce $\sim$20$\times$ faster.\\
That is why TP must stay within a node.}
\end{frame}
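
% --- Sketch: Wall 14 in Plain Python ---
\begin{frame}[fragile]{Sketch: The AllReduce Formula in Code}
\note{[2 min] Optional. The wall-box formula hand-rolled in plain
Python (not the \mlsysim{} API) so students can sweep N and BW.
% -- FLEX: [OPTIONAL]
}

\small
The wall-box formula, hand-rolled in plain Python (not the
\mlsysim{} API) so you can sweep $N$ and $BW$ yourself:

\begin{lstlisting}
def ring_allreduce_s(msg_bytes, n, bw_bytes_s, lat_s=500e-9):
    # T = 2(N-1)/N * M/BW + 2(N-1) * latency
    return ((2 * (n - 1) / n) * msg_bytes / bw_bytes_s
            + 2 * (n - 1) * lat_s)

print(f"NVLink, 8 GPUs: {ring_allreduce_s(1e9, 8, 900e9)*1e3:.1f} ms")
print(f"IB, 256 GPUs:   {ring_allreduce_s(1e9, 256, 50e9)*1e3:.1f} ms")
# -> ~1.9 ms vs ~40 ms, matching the slide
\end{lstlisting}
\end{frame}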

% --- Slide 4.5: Tensor Parallelism ---
\begin{frame}{Tensor Parallelism: Splitting Layers}
\note{[3 min] ``TP splits each layer's weight matrix across GPUs.
Every forward and backward pass requires 2 AllReduce ops per
layer. That is why TP only works on NVLink, not across nodes.''
% -- FLEX: [CORE]
}

\small
\textbf{How it works:}
\begin{enumerate}\setlength\itemsep{2pt}
\item Split weight matrix $W$ column-wise across $T$ GPUs
\item Each GPU computes $Y_i = X \cdot W_i$ (partial result)
\item AllReduce to combine: $Y = \sum Y_i$
\item \alert{2 AllReduce ops per layer} (forward + backward)
\end{enumerate}

\vspace{0.3cm}
\textbf{TP overhead:}
\[
T_{\text{TP}} = 2 \times L \times T_{\text{AllReduce}}(T)
\]

\begin{exampleblock}{Llama-3 70B, TP=8 on NVLink (900 GB/s)}
\begin{itemize}
\item 80 layers $\times$ 2 AllReduce $\times$ $\sim$0.1 ms each $\approx$ \textbf{16 ms overhead per step}
\item This is 10--20\% of a typical training step
\end{itemize}
\end{exampleblock}
\end{frame}
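
% --- Sketch: TP Overhead per Step ---
\begin{frame}[fragile]{Sketch: TP AllReduce Overhead per Step}
\note{[2 min] Optional. The microbatch shape and hidden size are
assumptions; the point is the order of magnitude, not the exact ms.
% -- FLEX: [OPTIONAL]
}

\small
A rough model of the example block, assuming one 4096-token
microbatch, hidden size 8192, and FP16 activations (all assumptions):

\begin{lstlisting}
def ring_allreduce_s(msg_bytes, n, bw):
    return (2 * (n - 1) / n) * msg_bytes / bw

L, T = 80, 8                  # Llama-3 70B layers, TP degree
act_bytes = 4096 * 8192 * 2   # seq x hidden x FP16 ~ 64 MB (assumed)
per_layer = ring_allreduce_s(act_bytes, T, 900e9)   # ~0.13 ms
overhead = 2 * L * per_layer  # fwd + bwd AllReduce per layer
print(f"TP overhead/step: {overhead*1e3:.0f} ms")
# -> ~21 ms, same ballpark as the slide's ~16 ms estimate
\end{lstlisting}
\end{frame}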

% --- Slide 4.6: Pipeline Parallelism ---
\begin{frame}{Pipeline Parallelism: The Bubble Problem}
\note{[3 min] ``With 4 stages and 4 microbatches, what fraction
of time is wasted?'' Expected: 3/7 = 43\%. With 32 microbatches:
3/35 = 8.6\%. Lesson: more microbatches = smaller bubble.
% -- FLEX: [CORE]
}

\small
\wallbox{Pipeline Bubble Fraction}{
\[
\text{Bubble} = \frac{P - 1}{M + P - 1}
\]
where $P$ = pipeline stages, $M$ = microbatches
}

\vspace{0.3cm}
\scriptsize
\begin{tabular}{lccc}
\toprule
$P$ (stages) & $M$ (microbatches) & Bubble & Effective utilization \\
\midrule
4 & 4 & 43\% & 57\% \\
4 & 16 & 16\% & 84\% \\
4 & 32 & 8.6\% & 91\% \\
8 & 32 & 18\% & 82\% \\
\bottomrule
\end{tabular}

\normalsize
\vspace{0.2cm}
\alert{More microbatches $\Rightarrow$ smaller bubble.\\
But more microbatches = more memory for activations.}
\end{frame}
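
% --- Sketch: Bubble Fraction ---
\begin{frame}[fragile]{Sketch: Bubble Fraction in One Line}
\note{[1 min] Optional. Reproduces the table; let students try P=16.
% -- FLEX: [OPTIONAL]
}

\small
The bubble formula in plain Python --- this reproduces the table above:

\begin{lstlisting}
def bubble(P, M):
    # fraction of pipeline time spent idle
    return (P - 1) / (M + P - 1)

for P, M in [(4, 4), (4, 16), (4, 32), (8, 32)]:
    print(f"P={P} M={M:2d}: bubble={bubble(P, M):5.1%} "
          f"util={1 - bubble(P, M):5.1%}")
\end{lstlisting}
\end{frame}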

% --- Slide 4.7: Gradient Accumulation ---
\begin{frame}{Gradient Accumulation: Virtual Batch Size}
\note{[2 min] ``Process K small microbatches and accumulate gradients
before the optimizer step. This fills the pipeline and amortizes AllReduce.''
% -- FLEX: [OPTIONAL]
}

\small
\[
B_{\text{effective}} = B_{\text{micro}} \times K \times \text{DP}
\]

\textbf{Why accumulate?}
\begin{itemize}\setlength\itemsep{2pt}
\item Fill the pipeline ($M = K$ microbatches)
\item Amortize AllReduce cost over $K$ steps
\item Simulate large batch size without large memory
\item Trade compute (more forward passes) for communication (fewer AllReduce)
\end{itemize}

\vspace{0.2cm}
\textbf{Example:} DP=128, $B_{\text{micro}}$=4, $K$=8
\begin{itemize}
\item $B_{\text{effective}} = 4 \times 8 \times 128 = 4096$
\item AllReduce only once per 8 microbatches
\item Pipeline bubble: $(P-1)/(8+P-1)$ --- much smaller
\end{itemize}
\end{frame}
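
% --- Sketch: Accumulation Arithmetic ---
\begin{frame}[fragile]{Sketch: Accumulation Arithmetic}
\note{[1 min] Optional. P=4 is an assumed stage count for illustration.
% -- FLEX: [OPTIONAL]
}

\small
The example's arithmetic, with an assumed $P=4$ pipeline for the
bubble comparison:

\begin{lstlisting}
B_micro, K, DP = 4, 8, 128
P = 4                               # assumed pipeline stages

print(f"Effective batch: {B_micro * K * DP}")   # 4096
print(f"Bubble at M=1: {(P-1)/(1+P-1):.0%}")    # 75%
print(f"Bubble at M=K: {(P-1)/(K+P-1):.0%}")    # 27%
\end{lstlisting}
\end{frame}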

% --- Slide 4.8: Hierarchical Communication ---
\begin{frame}{Hierarchical AllReduce: NVLink + InfiniBand}
\note{[2 min] ``Hierarchical AllReduce first reduces within each
node (fast NVLink), then across nodes (slower IB), then
broadcasts back. This exploits the bandwidth hierarchy.''
% -- FLEX: [OPTIONAL]
}

\small
\textbf{Real cluster topology:}

\centering
\begin{tikzpicture}[scale=0.75, >=Stealth,
gpu/.style={draw, fill=computeblue, rounded corners, minimum width=0.6cm,
minimum height=0.5cm, font=\tiny},
node/.style={draw, fill=white, rounded corners=4pt, dashed, inner sep=4pt}]

% Node 0
\node[node, label=above:{\scriptsize Node 0}] (n0) at (0,0) {
\begin{tikzpicture}
\foreach \i in {0,...,3} {
\node[gpu] (g0\i) at (\i*0.8, 0) {G\i};
}
\end{tikzpicture}
};

% Node 1
\node[node, label=above:{\scriptsize Node 1}] (n1) at (5.5,0) {
\begin{tikzpicture}
\foreach \i in {0,...,3} {
\node[gpu] (g1\i) at (\i*0.8, 0) {G\i};
}
\end{tikzpicture}
};

% NVLink labels
\node[font=\tiny, datastroke] at (0, -0.9) {NVLink 900 GB/s};
\node[font=\tiny, datastroke] at (5.5, -0.9) {NVLink 900 GB/s};

% IB link
\draw[very thick, crimson, <->] (2.2, 0) -- (3.3, 0)
node[midway, above, font=\tiny] {IB 50 GB/s};
\end{tikzpicture}

\vspace{0.3cm}
\flushleft
\small
\textbf{3-step hierarchical AllReduce:}
\begin{enumerate}\setlength\itemsep{1pt}
\item \textbf{Local reduce} within each node (NVLink --- fast)
\item \textbf{Global AllReduce} across leader GPUs (InfiniBand --- slow)
\item \textbf{Local broadcast} within each node (NVLink --- fast)
\end{enumerate}

\alert{TP within node (NVLink). DP across nodes (InfiniBand).}
\end{frame}
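
% --- Sketch: Hierarchical vs Flat AllReduce ---
\begin{frame}[fragile]{Sketch: Hierarchical vs Flat AllReduce}
\note{[2 min] Optional. Crude 3-step cost model with the diagram's
numbers (2 nodes x 4 GPUs, 1 GB message assumed); real NCCL
schedules are smarter.
% -- FLEX: [OPTIONAL]
}

\small
A crude cost model of the 3 steps, using the diagram's topology
(2 nodes $\times$ 4 GPUs) and an assumed 1 GB message --- real NCCL
schedules are more sophisticated:

\begin{lstlisting}
def ring_allreduce_s(msg, n, bw):
    return (2 * (n - 1) / n) * msg / bw

M = 1e9                                  # 1 GB (assumed)
intra = ring_allreduce_s(M, 4, 900e9)    # step 1: within node
inter = ring_allreduce_s(M, 2, 50e9)     # step 2: leader GPUs over IB
bcast = M / 900e9                        # step 3: NVLink broadcast
flat  = ring_allreduce_s(M, 8, 50e9)     # naive: all 8 ranks over IB

print(f"hierarchical: {(intra + inter + bcast)*1e3:.0f} ms")  # ~23 ms
print(f"flat over IB: {flat*1e3:.0f} ms")                     # ~35 ms
\end{lstlisting}
\end{frame}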

% --- Slide 4.9: Live Demo --- DistributedModel ---
\begin{frame}[fragile]{Live Demo: Distributed Training Analysis}
\note{[3 min] Run DistributedModel live. Show communication overhead,
bubble fraction, and scaling efficiency.
% -- FLEX: [CORE]
}

\small
\begin{lstlisting}
fleet = mlsysim.Systems.Clusters.Frontier_8K
dist = mlsysim.DistributedModel()
result = dist.solve(llama, fleet, batch_size=4096,
                    tp_size=8, pp_size=1, microbatch_count=32,
                    seq_len=4096)
print(f"Scaling eff: {result.scaling_efficiency:.1%}")
print(f"Comm overhead: {result.communication_overhead:.1%}")
print(f"Effective MFU: {result.effective_mfu:.1%}")
\end{lstlisting}

\vspace{0.2cm}
\begin{exampleblock}{What to look for}
Communication overhead + bubble fraction = total efficiency loss.
Effective MFU $=$ single-node MFU $\times$ scaling efficiency.
\end{exampleblock}
\end{frame}

% --- Slide 4.10: Wall 15 --- The Fragility Wall ---
\begin{frame}{Wall 15: The Fragility Wall (Reliability)}
\note{[2 min] ``If you have 10,000 GPUs each with 50,000 hour MTBF,
what is the cluster MTBF?'' Expected: 50,000/10,000 = 5 hours.
This is why checkpointing exists.
% -- FLEX: [CORE]
}

\small
\wallbox{The Fragility Wall}{
\[
\text{Cluster MTBF} = \frac{\text{Component MTBF}}{N_{\text{components}}}
\]
}

\vspace{0.2cm}
Assuming a 50{,}000-hour per-component MTBF:

\vspace{0.1cm}
\begin{tabular}{lcc}
\toprule
\textbf{Scale} & \textbf{GPUs} & \textbf{Cluster MTBF} \\
\midrule
Research lab & 8 & 260 days \\
Mid cluster & 256 & 8 days \\
Large cluster & 1,024 & 2 days \\
Frontier-scale & 8,192 & 6 hours \\
Mega cluster & 100K & 30 minutes \\
\bottomrule
\end{tabular}

\vspace{0.2cm}
\alert{At frontier scale, something breaks every 6 hours.\\
Without checkpointing, every failure wastes the entire run since the last save.}
\end{frame}
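
% --- Sketch: Cluster MTBF ---
\begin{frame}[fragile]{Sketch: Cluster MTBF}
\note{[1 min] Optional. One division reproduces the whole table.
% -- FLEX: [OPTIONAL]
}

\small
One division reproduces the table (50{,}000 h per-GPU MTBF, as above):

\begin{lstlisting}
MTBF_H = 50_000   # per-GPU MTBF in hours (assumed, as on the slide)

for n in [8, 256, 1024, 8192, 100_000]:
    h = MTBF_H / n
    print(f"{n:>7,} GPUs: cluster MTBF = {h:9.2f} h "
          f"= {h / 24:6.2f} days")
\end{lstlisting}
\end{frame}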

% --- Slide 4.10b: Predict --- Scaling to 256 GPUs ---
\begin{frame}{Predict: Scaling to 256 GPUs}
\note{[2 min] PREDICTION. Hands up for each answer. Most will say 32x (A).
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.}

\centering
\Large
\textbf{You have 8 H100s doing data-parallel training.}\\[0.5cm]
\textbf{You scale to 256 GPUs.}\\[0.5cm]
\normalsize
How much faster will training be?\\[0.3cm]
\begin{enumerate}[(A)]
\item 32$\times$ faster (perfect scaling)
\item 20--25$\times$ faster
\item 10--15$\times$ faster
\item \textbf{It depends on the model size}
\end{enumerate}
\end{frame}

% --- Slide 4.11: Scaling Efficiency ---
\begin{frame}{Scaling Efficiency: The Amdahl Trap}
\note{[2 min] ``Scaling efficiency is the fraction of ideal speedup
you actually achieve.'' Includes comm overhead, pipeline
bubbles, stragglers, and failure recovery.
% -- FLEX: [CORE]
}

\small
\[
\eta_{\text{scaling}} = \frac{\text{Actual speedup}}{N}
= \frac{1}{1 + \text{comm\_frac} + \text{bubble\_frac} + \text{straggler\_frac}}
\]

\vspace{0.3cm}
\begin{columns}[T]
\begin{column}{0.55\textwidth}
\textbf{What eats scaling efficiency:}
\begin{enumerate}\setlength\itemsep{2pt}
\item AllReduce communication
\item Pipeline bubbles
\item Straggler effects (slowest GPU)
\item Checkpoint I/O
\item Failure recovery
\end{enumerate}
\end{column}
\begin{column}{0.42\textwidth}
\centering
\scriptsize
\begin{tabular}{lc}
\toprule
\textbf{System} & $\eta_{\text{scaling}}$ \\
\midrule
8 GPUs (NVLink) & 95--98\% \\
64 GPUs (IB) & 85--92\% \\
1024 GPUs & 70--85\% \\
8192 GPUs & 55--70\% \\
\bottomrule
\end{tabular}
\end{column}
\end{columns}

\vspace{0.3cm}
\alert{At 8192 GPUs, you lose 30--45\% of your compute to overhead.}
\end{frame}
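
% --- Sketch: Where the Speedup Goes ---
\begin{frame}[fragile]{Sketch: Where the Speedup Goes}
\note{[1 min] Optional. The overhead fractions are illustrative
assumptions, not measurements --- the shape is what matters.
% -- FLEX: [OPTIONAL]
}

\small
The $\eta$ formula with illustrative (assumed) overhead fractions ---
this answers the 256-GPU prediction:

\begin{lstlisting}
def eta(comm, bubble, straggler):
    return 1 / (1 + comm + bubble + straggler)

# overhead fractions below are assumed for illustration
e8, e256 = eta(0.03, 0.0, 0.01), eta(0.15, 0.05, 0.05)
print(f"  8 GPUs: eta = {e8:.0%}")    # ~96%
print(f"256 GPUs: eta = {e256:.0%}")  # ~80%
print(f"8 -> 256 speedup: {32 * e256 / e8:.0f}x (not 32x)")
\end{lstlisting}
\end{frame}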

% --- Slide 4.12: Predict Before You Peek #5 ---
\begin{frame}{Predict: Optimal Parallelism Config}
\note{[2 min] You have 64 H100s. Llama-3 70B (140 GB FP16).
What TP x PP x DP? Give 90 seconds.
Expected: TP=8 (NVLink), PP=1 (no bubbles), DP=8 (64/8).
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.
% -- FLEX: [CORE]
}

\centering
\Large\bfseries
64 H100s. Llama-3 70B (140 GB FP16).\\[0.3cm]
\normalsize
What is the optimal TP $\times$ PP $\times$ DP?\\[0.5cm]

\pause

\small
\begin{tabular}{lcccl}
\toprule
\textbf{Config} & \textbf{TP} & \textbf{PP} & \textbf{DP} & \textbf{Why} \\
\midrule
Candidate A & 8 & 1 & 8 & TP within node, no bubbles \\
Candidate B & 4 & 2 & 8 & Less TP comm, but has bubbles \\
Candidate C & 2 & 4 & 8 & Minimal TP, large bubble \\
\bottomrule
\end{tabular}

\vspace{0.3cm}
\pause
\alert{Candidate A is typically best:} TP=8 uses full NVLink bandwidth,
PP=1 avoids pipeline bubbles entirely, DP=8 across nodes.

\vspace{0.2cm}
\textit{Rule of thumb: maximize TP within a node, minimize PP.}
\end{frame}

% --- Slide 4.13: Exercise 4 ---
\begin{frame}[fragile]{Exercise 4: Distributed Training Design}
\note{[5 min] Sweep TP in [1,2,4,8] and PP in [1,2,4,8] for
Llama-3 70B on 64 H100s. Expected: TP=8, PP=1, DP=8.
Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.
% -- FLEX: [CORE]
}

\centering
\Large\bfseries
Hands-On Exercise\\[0.5cm]
\normalsize

\textbf{Question:} Find the optimal TP$\times$PP for Llama-3 70B on 64 H100s.

\vspace{0.3cm}
\begin{lstlisting}
llama70 = mlsysim.Models.Language.Llama3_70B
fleet = mlsysim.Systems.Clusters.Research_256
for tp in [1, 2, 4, 8]:
    for pp in [1, 2, 4, 8]:
        if tp * pp > 64: continue  # 64-GPU budget: DP = 64/(TP*PP)
        r = dist.solve(llama70, fleet, batch_size=512,
                       tp_size=tp, pp_size=pp, seq_len=4096,
                       microbatch_count=max(4, 64 // (tp * pp)))
        print(f"TP={tp} PP={pp} eff={r.scaling_efficiency:.1%}")
\end{lstlisting}
\end{frame}

% --- Slide 4.14: Straggler Effects ---
\begin{frame}{Stragglers: The Slowest GPU Sets the Pace}
\note{[2 min] ``In synchronous training, every GPU must finish
before the next step begins. At 1000 GPUs, even 1\% variation
means 10 GPUs are significantly slow on any given step.''
% -- FLEX: [OPTIONAL]
}

\small
\textbf{Synchronous training:} step time $= \max_i(T_i)$

\vspace{0.2cm}
\begin{itemize}\setlength\itemsep{2pt}
\item \textbf{Thermal throttling:} hot GPUs clock down 5--10\%
\item \textbf{Network congestion:} some AllReduce messages delayed
\item \textbf{OS jitter:} background tasks steal cycles
\item \textbf{Memory pressure:} GC pauses in the data pipeline
\end{itemize}

\vspace{0.3cm}
\textbf{Mitigation strategies:}
\begin{itemize}\setlength\itemsep{2pt}
\item Asynchronous SGD (trade accuracy for speed)
\item Backup workers (redundant computation)
\item Bounded staleness (allow slight divergence)
\item \texttt{DistributedModel(straggler\_factor=1.05)} to simulate 5\% drag
\end{itemize}
\end{frame}
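
% --- Sketch: Straggler Penalty vs Scale ---
\begin{frame}[fragile]{Sketch: Straggler Penalty vs Scale}
\note{[2 min] Optional Monte Carlo. The 2\% Gaussian jitter is an
assumption; the lesson is that max-of-N grows with N.
% -- FLEX: [OPTIONAL]
}

\small
A tiny Monte Carlo of $\max_i(T_i)$, assuming 2\% Gaussian jitter
per GPU (an illustrative assumption):

\begin{lstlisting}
import random

def mean_step_time(n, trials=200):
    # per-GPU step time ~ Normal(1.0, 0.02); step = slowest GPU
    return sum(max(random.gauss(1.0, 0.02) for _ in range(n))
               for _ in range(trials)) / trials

for n in [8, 256, 8192]:
    print(f"{n:5d} GPUs: step ~ {mean_step_time(n):.3f}x the mean")
# penalty grows roughly like sqrt(2 ln N) -- the max, not the mean
\end{lstlisting}
\end{frame}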

% --- Slide 4.15: Part 4 Key Takeaway ---
\begin{frame}{Part 4: Key Takeaway}
\note{[1 min] One sentence. Repeat.
``Distributed training is a communication problem disguised
as a compute problem.''
% -- FLEX: [CORE]
}

\centering\Large

\textbf{Distributed training is a communication problem\\
disguised as a compute problem.}\\[0.5cm]

\normalsize
\begin{itemize}
\item 3D parallelism (DP $\times$ TP $\times$ PP) decomposes the problem.
\item TP needs NVLink (within node). DP works over InfiniBand (across nodes).
\item Pipeline bubbles shrink with more microbatches.
\item Reliability degrades as $\text{MTBF}/N$ --- checkpointing is mandatory.
\item \texttt{DistributedModel.solve()} captures all these effects.
\end{itemize}

\vspace{0.5cm}
\centering
\textit{Break --- reconvene at 2:30 PM for Part 5.}
\end{frame}
\end{document}