% ============================================================================= % MLSys·im Tutorial Tutorial — Parts 0–4 (Morning Session) % ============================================================================= \documentclass[aspectratio=169, 12pt]{beamer} \usepackage{../../../slides/assets/beamerthememlsys} \mlsyssetup{ volume = {Tutorial}, chapter = {Tutorial}, logo = {../../../slides/assets/img/logo-mlsysbook.png}, instlogo = {../../../slides/assets/img/logo-harvard.png}, chaptertitle = {MLSys·im: First-Principles ML Systems Modeling}, } % --- Fonts --- \usepackage{fontspec} \setsansfont{Helvetica Neue}[ BoldFont={Helvetica Neue Bold}, ItalicFont={Helvetica Neue Italic}, BoldItalicFont={Helvetica Neue Bold Italic}, ] % Use Courier if JetBrains Mono not available \IfFontExistsTF{JetBrains Mono}{ \setmonofont{JetBrains Mono}[Scale=0.85] }{ \setmonofont{Courier New}[Scale=0.90] } % --- Packages --- \usepackage{amsmath} \usepackage{booktabs} \usepackage[table]{xcolor} \usepackage{listings} \usepackage{tikz} \usetikzlibrary{arrows.meta, positioning, calc, decorations.pathreplacing} % --- Code listings --- \lstset{ language=Python, basicstyle=\ttfamily\footnotesize, keywordstyle=\color{crimson}\bfseries, stringstyle=\color{datastroke}, commentstyle=\color{midgray}\itshape, backgroundcolor=\color{computeblue!20}, frame=single, rulecolor=\color{computestroke}, numbers=none, breaklines=true, columns=fullflexible, keepspaces=true, showstringspaces=false, xleftmargin=4pt, xrightmargin=4pt, aboveskip=6pt, belowskip=4pt, } % --- Convenience macros --- \newcommand{\mlsysim}{\texttt{mlsysim}} \newcommand{\wallbox}[2]{% \begin{block}{#1}#2\end{block}% } \newcommand{\PredictStart}{\begin{alertblock}{Predict Before You Peek}} \newcommand{\PredictEnd}{\end{alertblock}} % --- Image paths --- \graphicspath{{images/}} % --- Section count (must match actual \section{} count) --- \setcounter{mlsystotalsections}{6} \title{MLSys·im: First-Principles ML Systems Modeling} \subtitle{A Hands-On Tutorial} \author{Vijay Janapa Reddi} \institute{Harvard University} \date{Tutorial} % ============================================================================= \begin{document} % ============================================================================= % PART 0: WELCOME & SETUP (5 slides) % ============================================================================= \section{Welcome \& Setup} % --- Slide 0.1: Title --- \begin{frame} \note{[1 min] Welcome attendees, set the tone. Welcome to the MLSys-im tutorial. Today we will build quantitative intuition for ML systems from first principles. % -- FLEX: [CORE] Title slide --- do not skip. } \titlepage \end{frame} % --- Slide 0.1b: The $200M Question --- \begin{frame}{The \$200 Million Question} \note{[3 min] THE HOOK. Open strong. Don't touch your laptop. Look at the audience.} \centering \Large \textbf{Meta spent \$200M training Llama-3-405B.}\\[1cm] \normalsize Before a single GPU was purchased:\\[0.3cm] \begin{itemize} \item How would you know \textbf{16,384 H100s} was the right fleet? \item How would you know \textbf{405B parameters} was the right model size? \item How would you know it would take \textbf{54 days}, not 540? \end{itemize} \vfill \small\textcolor{gray}{We will answer all three questions today --- on your laptop, in under a second, with no GPU.} \end{frame} % --- Slide 0.1c: Live Demo Reveal --- \begin{frame}[fragile]{Answer in 0.1 Seconds} \note{[2 min] Run this LIVE. 
The room should gasp at how fast the answer appears.} \begin{lstlisting} import mlsysim profile = mlsysim.Engine.solve( mlsysim.Models.Language.Llama3_8B, mlsysim.Hardware.Cloud.H100, batch_size=1, ) print(f"Bottleneck: {profile.bottleneck}") # Memory print(f"MFU: {profile.mfu:.3f}") # 0.003 \end{lstlisting} \vfill \centering \textbf{That took 0.1 seconds. On a laptop. No GPU.}\\[0.2cm] \small Now imagine doing this for every hardware option, every model size,\\ every parallelism strategy, every region. \textbf{That is mlsysim.} \end{frame} % --- Slide 0.2: What You Will Learn Today --- \begin{frame}{What You Will Learn Today} \note{[2 min] Walk through objectives quickly. Emphasize that by the end of the day every attendee will be able to do these five things. % -- FLEX: [CORE] } \small By the end of this tutorial you will be able to: \begin{enumerate} \item \textbf{Identify} which physical constraint is the binding bottleneck for any ML workload on any hardware. \item \textbf{Decompose} training and inference time using the Iron Law. \item \textbf{Compare} hardware configurations quantitatively with \mlsysim. \item \textbf{Reason} about the compute--memory--communication tradeoff space. \item \textbf{Estimate} TCO and carbon footprint for a real deployment. \end{enumerate} \vspace{0.3cm} \centering \textit{All you need is a laptop and} \texttt{pip install mlsysim} \end{frame} % --- Slide 0.3: Setup Check --- \begin{frame}[fragile]{Setup: Install \& Verify} \note{[3 min] Give attendees 2 minutes to run these commands. Walk around and help anyone with pip issues. If someone cannot install, they can pair with a neighbor. % -- FLEX: [CORE] --- must verify before proceeding. } \small Open a terminal and run: \begin{lstlisting} pip install mlsysim python3 -c "import mlsysim; print(mlsysim.__version__)" # Expected output: 0.1.0 \end{lstlisting} \vspace{0.3cm} Then run the hello-world sanity check: \begin{lstlisting} import mlsysim model = mlsysim.Models.Language.Llama3_8B hw = mlsysim.Hardware.Cloud.H100 prof = mlsysim.Engine.solve(model, hw, batch_size=1) print(prof.bottleneck) # -> "Memory" \end{lstlisting} \vspace{0.2cm} \centering \alert{If you see \texttt{Memory}, you are ready.} \vspace{0.3cm} \footnotesize \textit{Convention for the rest of the day:}\\ \texttt{import mlsysim} is assumed. We use \texttt{llama} $=$ \texttt{mlsysim.Models.Language.Llama3\_8B} and \texttt{hw} $=$ \texttt{mlsysim.Hardware.Cloud.H100} as shorthands. \end{frame} % --- Slide 0.4: The 22-Wall Taxonomy --- \begin{frame}{The 22 Physical Walls of ML Systems} \note{[2 min] This is the road map for the day. Point out that we will hit walls 1--7 (Node) before lunch and walls 8--22 after lunch. Each wall has one equation and one mlsysim solver. Ask: ``How many of these walls have you personally hit?'' Show of hands. 
% -- FLEX: [CORE] } \small \begin{columns}[T] \begin{column}{0.48\textwidth} \textbf{Domain 1: Node}\\[2pt] \begin{enumerate}\setlength\itemsep{1pt} \item Compute Wall \item Memory Wall \item Software Wall (MFU) \item Serving Wall \item Batching Wall (KV cache) \item Streaming Wall \item Tail Latency Wall \end{enumerate} \vspace{0.2cm} \textbf{Domain 2: Data}\\[2pt] \begin{enumerate}\setlength\itemsep{1pt}\setcounter{enumi}{7} \item Ingestion Wall \item Transformation Wall \item Locality Wall \end{enumerate} \end{column} \begin{column}{0.48\textwidth} \textbf{Domain 3: Algorithm}\\[2pt] \begin{enumerate}\setlength\itemsep{1pt}\setcounter{enumi}{10} \item Complexity Wall (Chinchilla) \item Reasoning Wall \item Fidelity Wall (Compression) \end{enumerate} \vspace{0.2cm} \textbf{Domain 4: Fleet}\\[2pt] \begin{enumerate}\setlength\itemsep{1pt}\setcounter{enumi}{13} \item Communication Wall \item Fragility Wall \item Multi-Tenant Wall \end{enumerate} \vspace{0.2cm} \textbf{Domain 5: Operations}\\[2pt] \begin{enumerate}\setlength\itemsep{1pt}\setcounter{enumi}{16} \item Capital Wall (TCO) \item Sustainability Wall \item Checkpoint Wall \item Safety Wall \end{enumerate} \end{column} \end{columns} \end{frame} % --- Slide 0.5: The Iron Law (Preview) --- \mlsysfocus{The Iron Law of ML Systems}{% \[ T = \frac{\text{FLOPs}}{N \;\times\; \text{Peak} \;\times\; \text{MFU} \;\times\; \eta_{\text{scaling}} \;\times\; \text{Goodput}} \] \\[0.5cm] \normalsize Every wall maps to one of these five denominator terms.\\ This single equation is our compass for the entire day. } % ============================================================================= % RELATED WORK & POSITIONING (8 slides) % ============================================================================= \input{related_work} % --- Roadmap: You Are Here (Morning) --- \begin{frame}{Roadmap: You Are Here} \note{[1 min] Quick orientation. We just finished the setup. Now the real work begins.} \centering\small \begin{tabular}{rll} \toprule \textbf{Time} & \textbf{Part} & \textbf{Status} \\ \midrule 9:00--9:30 & Part 0: Welcome \& Setup & \checkmark Done \\ \rowcolor{crimson!12} 9:30--10:30 & \textbf{Part 1: Iron Law \& Roofline} & \textbf{$\leftarrow$ You are here} \\ 10:45--11:45 & Part 2: Memory Walls \& Serving & \\ 11:45--12:00 & Part 3: Compression & \\ \midrule \textit{12:00--1:00} & \textit{Lunch} & \\ \midrule 1:00--2:15 & Part 4: Going Distributed & \\ 2:30--3:15 & Part 5: Economics \& Sustainability & \\ 3:15--3:45 & Part 6: Design Space Exploration & \\ 3:45--4:15 & Part 7: TinyML to Frontier & \\ 4:15--4:45 & Part 8: Advanced Topics & \\ 4:45--5:00 & Part 9: Wrap-Up \& Capstone & \\ \bottomrule \end{tabular} \end{frame} % ============================================================================= % PART 1: THE IRON LAW & ROOFLINE (15 slides) % ============================================================================= \section{Iron Law \& Roofline} % --- Slide 1.1: Key Question --- \begin{frame}{Key Question} \note{[1 min] Pose the question dramatically. Pause for 5 seconds. ``This is the most important question in ML systems engineering. By the end of this section you will answer it in 3 lines of Python.'' % -- FLEX: [CORE] } \centering \Large\bfseries Why doesn't doubling FLOPS\\[0.3cm] double your throughput? 
\end{frame} % --- Slide 1.2: Constraints Drive Architecture --- \begin{frame}{Constraints Drive Architecture} \note{[2 min] ``You don't choose a Transformer because it's trendy; you choose it because of how it parallelizes on real silicon. AI is not magic --- it is infrastructure, and infrastructure has laws.'' % -- FLEX: [CORE] } \small \begin{itemize} \item Hardware has \textbf{finite compute} (FLOPS), \textbf{finite bandwidth} (GB/s), and \textbf{finite memory} (GB). \item Every workload demands some amount of each. \item The \textbf{binding constraint} is the one that takes the longest. \item \alert{You optimize the bottleneck, not the fast part.} \end{itemize} \vspace{0.5cm} \centering \begin{tikzpicture}[>=Stealth, node distance=3cm] \node[draw, fill=computeblue, rounded corners, minimum width=2.5cm, minimum height=1cm] (compute) {\textbf{Compute}}; \node[draw, fill=datagreen, rounded corners, minimum width=2.5cm, minimum height=1cm, right=of compute] (memory) {\textbf{Memory BW}}; \node[draw, fill=routingorange, rounded corners, minimum width=2.5cm, minimum height=1cm, right=of memory] (network) {\textbf{Network}}; \draw[->, thick, crimson] (compute) -- node[above, font=\scriptsize] {which is slowest?} (memory); \draw[->, thick, crimson] (memory) -- node[above, font=\scriptsize] {which is slowest?} (network); \end{tikzpicture} \end{frame} % --- Slide 1.3: The Roofline Model --- \begin{frame}{The Roofline Model (Williams et al., 2009)} \note{[3 min] Draw the two regimes on the board. Left = memory-bound, right = compute-bound. The ridge point is where they cross. ``Before I show numbers: if a model does 16B FLOPs and loads 16 GB of weights, what is its arithmetic intensity?'' Expected: 1 FLOP/byte. That is far left on the Roofline. WARN: Students conflate FLOPS with throughput. % -- FLEX: [CORE] } \small \[ \text{Attainable FLOPS} = \min\!\bigl(\text{Peak FLOPS},\;\; \text{BW} \times \text{Arithmetic Intensity}\bigr) \] \vspace{0.2cm} \begin{columns}[T] \begin{column}{0.55\textwidth} \textbf{Arithmetic Intensity} (AI): \[ \text{AI} = \frac{\text{FLOPs}}{\text{Bytes moved}} \;\;\bigl[\text{FLOP/byte}\bigr] \] \vspace{0.2cm} \begin{itemize}\setlength\itemsep{2pt} \item \textbf{AI $<$ Ridge Point} $\Rightarrow$ \colorbox{datagreen}{Memory-bound} \item \textbf{AI $>$ Ridge Point} $\Rightarrow$ \colorbox{computeblue}{Compute-bound} \end{itemize} \vspace{0.2cm} Ridge Point $=$ Peak FLOPS $/$ Peak BW \end{column} \begin{column}{0.42\textwidth} \centering \begin{tikzpicture}[scale=0.7] % axes \draw[->, thick] (0,0) -- (6.5,0) node[right, font=\scriptsize] {AI (FLOP/B)}; \draw[->, thick] (0,0) -- (0,4.5) node[above, font=\scriptsize] {GFLOPS}; % memory roof \draw[very thick, datastroke] (0,0) -- (3,3); % compute roof \draw[very thick, computestroke] (3,3) -- (6.2,3); % ridge point \fill[crimson] (3,3) circle (3pt); \node[above right, font=\scriptsize, crimson] at (3,3) {Ridge}; % labels \node[font=\scriptsize, datastroke, rotate=42] at (1.2,1.7) {BW-limited}; \node[font=\scriptsize, computestroke] at (4.8,3.4) {Compute-limited}; \end{tikzpicture} \end{column} \end{columns} \end{frame} % --- Slide 1.4: The Compute Wall --- \begin{frame}{Wall 1: The Compute Wall} \note{[2 min] ``This is the speed limit. 
No software trick can make your model run faster than the chip can crunch numbers.'' % -- FLEX: [CORE] } \small \wallbox{The Compute Wall}{ \[ T_{\text{compute}} = \frac{\text{Operations}}{\text{Peak FLOPS} \times \text{Efficiency}} \] } \vspace{0.2cm} \textbf{Example:} ResNet-50 inference at batch 256 on H100 \begin{itemize} \item FLOPs = $8.0 \times 10^{9} \times 256 = 2.05 \times 10^{12}$ \item H100 FP16 Peak = 989 TFLOPS \item At 50\% MFU: $T = \frac{2.05 \times 10^{12}}{989 \times 10^{12} \times 0.5} \approx 4.1\;\text{ms}$ \end{itemize} \vspace{0.2cm} \alert{The chip is the ceiling. MFU is how close you get to it.} \end{frame} % --- Slide 1.5: The Memory Wall --- \begin{frame}{Wall 2: The Memory Wall} \note{[2 min] ``Quick mental math: 16 GB model, 3.35 TB/s bandwidth. How long to load?'' Give 10 seconds. Expected: 16/3350 = 4.8 ms. WARN: Students assume compute is always the bottleneck because GPUs are marketed on TFLOPS. % -- FLEX: [CORE] } \small \wallbox{The Memory Wall}{ \[ T_{\text{memory}} = \frac{\text{Weight Bytes}}{\text{Memory Bandwidth}} \] } \vspace{0.2cm} \textbf{Example:} Llama-3 8B at batch size 1 on H100 \begin{itemize} \item Weight size (FP16) = $8\text{B} \times 2\;\text{bytes} = 16\;\text{GB}$ \item H100 HBM3 BW = 3.35 TB/s \item $T = \frac{16}{3350} \approx 4.8\;\text{ms}$ just to load weights \item Meanwhile, compute finishes in $\sim$0.03 ms \end{itemize} \vspace{0.2cm} \alert{At batch size 1, LLM inference is $\sim$\,160$\times$ memory-bound.} \end{frame} % --- Slide 1.6: Predict Before You Peek #1 --- \begin{frame}{Predict: H100 vs MI300X vs Gaudi\,3} \note{[3 min] PREDICTION. Give the audience 60 seconds to think. Expected answer: all memory-bound. BW ratios determine speedup, not FLOPS. After the reveal, hammer home: ``The bottleneck determines the speedup.'' Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds. % -- FLEX: [CORE] } \centering \Large\bfseries Three flagship accelerators. Same workload.\\[0.3cm] \normalsize Llama-3 8B, batch size 1, FP16 inference.\\ Which is fastest---and by how much?\\[0.3cm] \pause \scriptsize \begin{tabular}{lccc} \toprule & \textbf{H100 (NVIDIA)} & \textbf{MI300X (AMD)} & \textbf{Gaudi\,3 (Intel)} \\ \midrule Peak FP16 & 989 TFLOPS & 1,307 TFLOPS & 1,835 TFLOPS \\ HBM BW & 3.35 TB/s & 5.3 TB/s & 3.7 TB/s \\ HBM Capacity & 80 GB & 192 GB & 128 GB \\ \midrule Bottleneck & Memory & Memory & Memory \\ Weight-load time & 4.8 ms & 3.0 ms & 4.3 ms \\ \textbf{Speedup vs H100} & --- & \textbf{1.6$\times$} & \textbf{1.1$\times$} \\ \bottomrule \end{tabular} \vspace{0.3cm} \pause \small \alert{MI300X has fewer FLOPS than Gaudi\,3 but wins on bandwidth.\\ FLOPS don't determine speed when memory-bound.} \end{frame} % --- Slide 1.7: Live Demo --- Engine.solve (multi-vendor) --- \begin{frame}[fragile]{Live Demo: Three Vendors, One API} \note{[3 min] Run this live. The key moment: all three are memory-bound. The ranking follows bandwidth, not FLOPS. This is ISCA---show that mlsysim is not an NVIDIA-only tool. 
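Back-of-envelope check for the expected output, assuming bs=1 latency is dominated by weight loading: $16/3.35 \approx 4.8$ ms (H100), $16/5.3 \approx 3.0$ ms (MI300X), $16/3.7 \approx 4.3$ ms (Gaudi\,3); the printed latencies should land in that neighborhood.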
% -- FLEX: [CORE] } \small Run this in your Python session: \begin{lstlisting} import mlsysim model = mlsysim.Models.Language.Llama3_8B for hw_name in ["H100", "MI300X", "Gaudi3"]: hw = getattr(mlsysim.Hardware.Cloud, hw_name) p = mlsysim.Engine.solve(model, hw, batch_size=1) print(f"{hw.name}: {p.bottleneck}, " f"{p.latency:.2f}") \end{lstlisting} \vspace{0.2cm} \begin{exampleblock}{What to look for} \begin{itemize} \item \texttt{bottleneck}: Memory for \textbf{all three} \item Ranking follows BW (MI300X $>$ Gaudi\,3 $>$ H100), not FLOPS \item Same API, same physics, different silicon \end{itemize} \end{exampleblock} \end{frame} % --- Slide 1.8: MFU --- The Software Wall --- \begin{frame}{Wall 3: MFU --- The Software Wall} \note{[2 min] ``MFU measures the gap between what the hardware could do and what your software actually achieves. A 50\% MFU means you are paying for twice the hardware you are using.'' % -- FLEX: [CORE] } \small \wallbox{Model FLOPs Utilization}{ \[ \text{MFU} = \frac{\text{Achieved FLOPS}}{\text{Peak FLOPS}} \] } \vspace{0.2cm} \begin{columns}[T] \begin{column}{0.55\textwidth} \textbf{What eats MFU?} \begin{itemize}\setlength\itemsep{2pt} \item Kernel launch overhead \item Memory stalls (cache misses) \item Framework overhead (Python $\to$ CUDA) \item Suboptimal operator fusion \item \alert{Being memory-bound} (the biggest one!) \end{itemize} \end{column} \begin{column}{0.42\textwidth} \centering \textbf{Typical MFU ranges}\\[4pt] \scriptsize \begin{tabular}{lc} \toprule Workload & MFU \\ \midrule LLM training (optimized) & 40--55\% \\ LLM inference (bs=1) & $<$5\% \\ ResNet training & 30--40\% \\ FlashAttention & 60--75\% \\ \bottomrule \end{tabular} \end{column} \end{columns} \vspace{0.2cm} \alert{Improving MFU is often cheaper than buying more GPUs.} \end{frame} % --- Slide 1.8b: What Is Eta? --- \begin{frame}{What Is $\eta$? (The Efficiency Parameter)} \note{[3 min] CRITICAL — every demo uses eta. Explain it ONCE here, then it's understood for the day. ANALOGY: ``eta is to ML systems what CPI is to CPU design --- an empirical constant that bridges peak specs and reality.''} \small \textbf{$\eta$ = Achieved FLOPS / Peak FLOPS} (Model FLOPs Utilization) \vspace{0.3cm} The gap between what your hardware \emph{could} do and what it \emph{actually} does. \vspace{0.3cm} \begin{columns}[T] \begin{column}{0.48\textwidth} \textbf{What reduces $\eta$:} \begin{itemize}\setlength\itemsep{1pt} \item Kernel launch overhead \item SM occupancy limits \item Memory coalescing misses \item Framework overhead (Python GIL) \item Communication stalls \end{itemize} \end{column} \begin{column}{0.48\textwidth} \textbf{Typical values:} \scriptsize \begin{tabular}{@{}lr@{}} \toprule Scenario & $\eta$ \\ \midrule Training (Megatron-LM) & 0.40--0.55 \\ Training (PyTorch eager) & 0.08--0.15 \\ Inference decode, bs=1 & 0.01--0.05 \\ Inference decode, bs=32+ & 0.15--0.35 \\ Inference prefill & 0.30--0.50 \\ TinyML (TFLite Micro) & 0.05--0.15 \\ \bottomrule \end{tabular} \end{column} \end{columns} \vfill \centering \small\textcolor{gray}{You do not predict $\eta$ --- you measure it once and use it for what-if analysis.} \end{frame} % --- Slide 1.9: The Iron Law (Full) --- \begin{frame}{The Iron Law of ML Systems} \note{[3 min] Walk through each denominator term. Point out that every wall in the 22-wall taxonomy maps to exactly one term. 
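Optional worked example tying back to the opening hook (rough, assumed round numbers): FLOPs $\approx 6ND$ with $N = 405$B and $D \approx 15$T tokens gives $\approx 3.6 \times 10^{25}$; the denominator with $16{,}384$ GPUs at 989 TFLOPS and a combined MFU $\times$ $\eta_{\text{scaling}}$ $\times$ Goodput of roughly 0.4 is $\approx 6.5 \times 10^{18}$ FLOP/s; dividing gives $\approx 5.6 \times 10^{6}$ s, about 65 days, the right ballpark for the 54-day headline.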
``Which term do you think is hardest to improve?'' % -- FLEX: [CORE] } \[ T = \frac{\text{FLOPs}}{N \;\times\; \text{Peak} \;\times\; \text{MFU} \;\times\; \eta_{\text{scaling}} \;\times\; \text{Goodput}} \] \vspace{0.3cm} \small \begin{tabular}{llll} \toprule \textbf{Term} & \textbf{Meaning} & \textbf{Reduced by} & \textbf{Walls} \\ \midrule $N$ & Number of devices & Budget & --- \\ Peak & Raw hardware speed & GPU generation & 1 (Compute) \\ MFU & Software efficiency & FlashAttention, fusion & 2--3 \\ $\eta_{\text{scaling}}$ & Communication loss & BW, gradient compression & 14--16 \\ Goodput & Failure overhead & Checkpointing, FT & 15, 19 \\ \bottomrule \end{tabular} \vspace{0.3cm} \centering \textit{Every wall in the taxonomy attacks one of these five terms.} \end{frame} % --- Slide 1.10: Arithmetic Intensity Deep Dive --- \begin{frame}{Arithmetic Intensity: The Dial You Control} \note{[2 min] ``Batch size is the primary knob. Each additional sample in the batch reuses the same weights that are already loaded. The compute grows linearly but memory stays constant.'' % -- FLEX: [CORE] } \small \[ \text{AI} = \frac{\text{FLOPs}}{\text{Bytes}} \approx \frac{2 \times \text{Params} \times B}{ \underbrace{\text{Params} \times \text{bpp}}_{\text{weights}} + \underbrace{\text{Activations}(B)}_{\text{grows with } B}} \] \vspace{0.2cm} \textbf{The batch-size knob:} \begin{itemize} \item At $B=1$: AI $\approx$ 1 FLOP/byte $\Rightarrow$ \textbf{memory-bound} \item At $B=32$: AI $\approx$ 32 FLOP/byte $\Rightarrow$ approaching ridge \item At $B=256$: AI $\gg$ ridge $\Rightarrow$ \textbf{compute-bound} \end{itemize} \vspace{0.2cm} \centering \begin{tikzpicture}[scale=0.65] \draw[->, thick] (0,0) -- (7,0) node[right, font=\scriptsize] {Batch size}; \draw[->, thick] (0,0) -- (0,4) node[above, font=\scriptsize] {Throughput}; \draw[very thick, datastroke] (0.3,0.3) -- (3,3); \draw[very thick, computestroke] (3,3) -- (6.5,3); \fill[crimson] (3,3) circle (3pt); \node[above, font=\scriptsize, crimson] at (3,3.1) {Ridge}; \node[below, font=\scriptsize, datastroke] at (1.3,0) {BW-bound}; \node[below, font=\scriptsize, computestroke] at (5,0) {Compute-bound}; \end{tikzpicture} \end{frame} % --- Slide 1.11: Live Demo --- Batch Size Sweep --- \begin{frame}[fragile]{Live Demo: Finding the Ridge Point} \note{[3 min] Run this loop live. Show how the bottleneck flips from Memory to Compute as batch size increases. Before running, ask: ``At what batch size do you predict the bottleneck will flip?'' Take guesses. % -- FLEX: [CORE] } \small \begin{lstlisting} llama = mlsysim.Models.Language.Llama3_8B hw = mlsysim.Hardware.Cloud.H100 for bs in [1, 4, 16, 64, 128, 256]: p = mlsysim.Engine.solve(llama, hw, batch_size=bs) print(f"bs={bs:>3d} {p.bottleneck:<8s} " f"MFU={p.mfu:.3f}") \end{lstlisting} \vspace{0.3cm} \begin{exampleblock}{What to observe} \begin{itemize} \item Bottleneck flips from \texttt{Memory} to \texttt{Compute} \item MFU climbs as batch size increases (better hardware utilization) \item Latency grows but throughput (tokens/s) improves \end{itemize} \end{exampleblock} \end{frame} % --- Slide 1.12: Exercise 1 --- \begin{frame}[fragile]{Exercise 1: Find the Crossover} \note{[5 min] Attendees work individually. At what batch size does Llama-3 8B on H100 transition from memory-bound to compute-bound? Expected: around bs=32--64 depending on precision. Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds. % -- FLEX: [CORE] --- this is a critical hands-on moment. 
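Optional hint for fast finishers: the loop on the previous demo slide already prints \texttt{p.mfu}; suggest adding it to this loop as well and watching where its climb levels off, which gives a second view of the same crossover.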
} \centering \Large\bfseries Hands-On Exercise\\[0.5cm] \normalsize \textbf{Question:} At what batch size does Llama-3 8B on H100 transition from memory-bound to compute-bound? \vspace{0.3cm} \begin{lstlisting} for bs in range(1, 129): p = mlsysim.Engine.solve(llama, hw, batch_size=bs) if p.bottleneck == "Compute": print(f"Crossover at batch size {bs}") break \end{lstlisting} \vspace{0.3cm} \textit{Bonus: Try the same on A100. Does the crossover happen at the same batch size? Why or why not?} \end{frame} % --- Slide 1.13: The Ridge Point Explained --- \begin{frame}{The Ridge Point: Hardware DNA} \note{[2 min] ``The ridge point is a property of the hardware, not the workload. It tells you how many FLOPs per byte the chip can sustain before compute becomes the ceiling.'' % -- FLEX: [CORE] } \small \[ \text{Ridge Point} = \frac{\text{Peak FLOPS}}{\text{Peak BW}} \;\;\bigl[\text{FLOP/byte}\bigr] \] \vspace{0.2cm} \scriptsize \begin{tabular}{llccc} \toprule \textbf{Vendor} & \textbf{Hardware} & \textbf{Peak FP16} & \textbf{HBM BW} & \textbf{Ridge} \\ \midrule NVIDIA & H100 SXM & 989 TFLOPS & 3.35 TB/s & 295 FLOP/B \\ NVIDIA & B200 & 2.25 PFLOPS & 8.0 TB/s & 281 FLOP/B \\ AMD & MI300X & 1,307 TFLOPS & 5.3 TB/s & 247 FLOP/B \\ Intel & Gaudi\,3 & 1,835 TFLOPS & 3.7 TB/s & 496 FLOP/B \\ \bottomrule \end{tabular} \small \vspace{0.3cm} \begin{itemize} \item Higher ridge $\Rightarrow$ more workloads are memory-bound on this chip \item \alert{FLOPS grow faster than bandwidth across GPU generations} \item The memory wall is getting \textbf{worse}, not better \end{itemize} \end{frame} % --- Slide 1.14: Predict Before You Peek #2 --- \begin{frame}{Predict: ResNet-50 vs Llama-3 8B} \note{[2 min] ``ResNet-50 at batch 256 vs Llama-3 8B at batch 1. Which is compute-bound and which is memory-bound?'' Give 30 seconds. Expected: ResNet at high batch is compute-bound; Llama at bs=1 is memory-bound. Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds. % -- FLEX: [CORE] } \centering \Large\bfseries Two workloads on the same H100.\\[0.3cm] \normalsize Which is compute-bound? Which is memory-bound?\\[0.5cm] \pause \small \begin{tabular}{lcc} \toprule & \textbf{ResNet-50 (bs=256)} & \textbf{Llama-3 8B (bs=1)} \\ \midrule Total FLOPs & $2.05 \times 10^{12}$ & $1.6 \times 10^{10}$ \\ Weight bytes & 50 MB (FP16) & 16 GB (FP16) \\ AI (FLOP/B) & $\sim$41{,}000 & $\sim$1 \\ \midrule \textbf{Regime} & \colorbox{computeblue}{Compute-bound} & \colorbox{datagreen}{Memory-bound} \\ \bottomrule \end{tabular} \vspace{0.3cm} \pause \alert{Same hardware, completely different bottlenecks.}\\ \textit{The workload determines the regime, not the GPU.} \end{frame} % --- Slide 1.15: Key Takeaway (Part 1) --- \begin{frame}{Part 1: Key Takeaway} \note{[1 min] Summarize in one sentence. Repeat it twice. ``The bottleneck determines the speedup. Know your regime.'' % -- FLEX: [CORE] } \centering\Large \textbf{The bottleneck determines the speedup.}\\[0.5cm] \normalsize \begin{itemize} \item The Roofline model tells you \textit{which} constraint is binding. \item Batch size is the primary knob that moves you between regimes. \item More FLOPS only helps if you are compute-bound. \item More bandwidth only helps if you are memory-bound. \item \texttt{Engine.solve()} answers this in one line. \end{itemize} \end{frame} % --- Roadmap: After Break --- \begin{frame}{Roadmap: You Are Here} \note{[1 min] Quick orientation after break. 
We now move from single-op analysis to serving.} \centering\small \begin{tabular}{rll} \toprule \textbf{Time} & \textbf{Part} & \textbf{Status} \\ \midrule 9:00--9:30 & Part 0: Welcome \& Setup & \checkmark \\ 9:30--10:30 & Part 1: Iron Law \& Roofline & \checkmark \\ \rowcolor{crimson!12} 10:45--11:45 & \textbf{Part 2: Memory Walls \& Serving} & \textbf{$\leftarrow$ You are here} \\ 11:45--12:00 & Part 3: Compression & \\ \bottomrule \end{tabular} \end{frame} % ============================================================================= % PART 2: MEMORY WALLS & SERVING (12 slides) % ============================================================================= \section{Memory Walls \& Serving} % --- Slide 2.1: Key Question --- \begin{frame}{Key Question} \note{[1 min] ``You have seen that LLM inference at batch 1 is memory-bound. But serving is more complex than a single forward pass. What makes LLM serving fundamentally different from CNN inference?'' % -- FLEX: [CORE] } \centering \Large\bfseries Why does the first token take 50\,ms\\[0.2cm] but each next token only takes 5\,ms? \end{frame} % --- Slide 2.2: Two Phases of LLM Serving --- \begin{frame}{Wall 4: Prefill vs Decode --- Two Different Physics} \note{[3 min] ``Prefill is like reading a book fast (compute-intensive). Decode is like looking up one word at a time in a dictionary (memory-intensive). Same model, different bottlenecks.'' % -- FLEX: [CORE] } \small \wallbox{The Serving Wall}{ \begin{tabular}{lcl} \textbf{TTFT} (Prefill) & $=$ & $\dfrac{\text{Prefill FLOPs}}{\text{Peak FLOPS} \times \text{MFU}}$ \quad\colorbox{computeblue}{Compute-bound} \\[10pt] \textbf{ITL} (Decode) & $=$ & $\dfrac{\text{Weight Bytes}}{\text{Bandwidth}}$ \quad\colorbox{datagreen}{Memory-bound} \end{tabular} } \vspace{0.3cm} \begin{columns}[T] \begin{column}{0.48\textwidth} \textbf{Prefill} (process the prompt) \begin{itemize}\setlength\itemsep{2pt} \item All prompt tokens in parallel \item $O(S^2)$ attention + $O(S \cdot P)$ linear \item Compute-bound (high AI) \item Determines \textbf{TTFT} \end{itemize} \end{column} \begin{column}{0.48\textwidth} \textbf{Decode} (generate tokens) \begin{itemize}\setlength\itemsep{2pt} \item One token at a time \item Must reload all weights per token \item Memory-bound (AI $\approx$ 1) \item Determines \textbf{ITL} \end{itemize} \end{column} \end{columns} \end{frame} % --- Slide 2.3: KV Cache --- The Hidden Consumer --- \begin{frame}{Wall 5: The KV Cache --- Hidden Memory Consumer} \note{[3 min] ``Each active request carries its own memory of the conversation.'' Quick math: how much KV cache does one Llama-3 8B request at 4K context need in FP16? Expected: 2 * 32 * 32 * 128 * 4096 * 2 bytes = ~2 GB. % -- FLEX: [CORE] } \small \wallbox{The Batching Wall}{ \[ \text{KV cache} = 2 \times L \times H \times d \times S \times B \times \text{bpp} \] } \vspace{0.2cm} \textbf{Llama-3 8B at 4K context, FP16:} \begin{itemize} \item $2 \times 32 \times 32 \times 128 \times 4096 \times 2\;\text{bytes} \approx 2\;\text{GB}$ per request \item H100 has 80 GB HBM --- model weights take 16 GB \item Remaining 64 GB $\div$ 2 GB/request $=$ \textbf{$\sim$32 concurrent requests} \end{itemize} \vspace{0.2cm} \begin{alertblock}{The serving paradox} You want high batch size (for throughput) but KV cache limits how many requests fit in memory. 
\alert{Memory capacity, not compute, limits concurrency.} \end{alertblock} \end{frame} % --- Slide 2.4: Live Demo --- ServingModel --- \begin{frame}[fragile]{Live Demo: Two-Phase Serving Analysis} \note{[2 min] Run ServingModel live. Point out TTFT vs ITL in output. Show that TTFT is compute-bound and ITL is memory-bound. % -- FLEX: [CORE] } \small \begin{lstlisting} serving = mlsysim.ServingModel() result = serving.solve(llama, hw, seq_len=4096, batch_size=1) print(f"TTFT: {result.ttft:~P}") print(f"ITL: {result.itl:~P}") print(f"KV cache/req: {result.kv_cache_per_request:~P}") \end{lstlisting} \vspace{0.2cm} \begin{exampleblock}{Expected output} TTFT $\approx$ 20--50 ms (compute-bound), ITL $\approx$ 5 ms (memory-bound), KV cache per request $\approx$ 2 GB \end{exampleblock} \end{frame} % --- Slide 2.5: Continuous Batching --- \begin{frame}{Continuous Batching: Don't Wait, Serve} \note{[2 min] ``In static batching, the GPU waits for the longest request to finish. In continuous batching, new requests start as soon as any slot frees up. Throughput can improve 2--5x.'' % -- FLEX: [OPTIONAL] Can summarize quickly if behind schedule. } \small \begin{columns}[T] \begin{column}{0.48\textwidth} \textbf{Static batching} \begin{itemize}\setlength\itemsep{2pt} \item Pad all sequences to max length \item GPU idle while short requests finish \item Throughput limited by longest request \item Simple to implement \end{itemize} \end{column} \begin{column}{0.48\textwidth} \textbf{Continuous batching} \begin{itemize}\setlength\itemsep{2pt} \item Insert new requests per iteration \item No padding waste \item Throughput 2--5$\times$ higher \item Used by vLLM, TGI, TensorRT-LLM \end{itemize} \end{column} \end{columns} \vspace{0.3cm} \centering \begin{tikzpicture}[scale=0.65, >=Stealth] % Static \node[font=\scriptsize\bfseries] at (0, 2.5) {Static}; \foreach \i/\len in {0/4, 1/2, 2/3} { \fill[computeblue, draw=computestroke] (0.5, \i*0.7) rectangle ({0.5 + \len*0.5}, \i*0.7+0.5); \fill[errorfill, draw=errorstroke, opacity=0.5] ({0.5 + \len*0.5}, \i*0.7) rectangle (2.5, \i*0.7+0.5); } \node[font=\tiny, errorstroke] at (2.8, 0.7) {waste}; % Continuous \node[font=\scriptsize\bfseries] at (5, 2.5) {Continuous}; \fill[computeblue, draw=computestroke] (5.5, 1.4) rectangle (7.5, 1.9); \fill[datagreen, draw=datastroke] (5.5, 0.7) rectangle (6.5, 1.2); \fill[routingorange, draw=routingstroke] (6.7, 0.7) rectangle (7.5, 1.2); \fill[computeblue, draw=computestroke] (5.5, 0) rectangle (7, 0.5); \fill[datagreen, draw=datastroke] (7.2, 0) rectangle (7.5, 0.5); \node[font=\tiny, datastroke] at (8, 0.7) {no waste}; \end{tikzpicture} \end{frame} % --- Slide 2.6: PagedAttention --- \begin{frame}[fragile]{PagedAttention: Virtual Memory for KV Cache} \note{[2 min] ``Just like OS virtual memory pages physical RAM, PagedAttention pages the KV cache. Non-contiguous blocks mean no fragmentation, so you can fit more requests.'' % -- FLEX: [OPTIONAL] } \small \textbf{The problem:} Pre-allocated KV cache wastes memory on short sequences. \textbf{The solution} (Kwon et al., 2023 --- vLLM): \begin{itemize} \item Divide KV cache into fixed-size \textbf{pages} (e.g., 16 tokens each) \item Allocate pages on demand, not up front \item Non-contiguous storage eliminates fragmentation \item Memory utilization improves from $\sim$50\% to $>$95\% \end{itemize} \vspace{0.2cm} \begin{exampleblock}{Impact} With the same 80 GB H100, PagedAttention can serve \textbf{2--4$\times$ more concurrent requests} than static allocation. 
\end{exampleblock} \vspace{0.2cm} \begin{lstlisting} # mlsysim models this with ContinuousBatchingModel cb = mlsysim.ContinuousBatchingModel() result = cb.solve(model, hw, seq_len=4096, max_batch_size=32, page_size=16) print(f"Max concurrent: {result.max_concurrent_requests}") \end{lstlisting} \end{frame} % --- Slide 2.7: Predict Before You Peek #3 --- \begin{frame}{Predict: How Many Concurrent Requests?} \note{[2 min] Predict-before-reveal. Give 60 seconds. H100 80 GB, Llama-3 8B FP16, 4K context. How many concurrent requests? Expected: weights = 16 GB, remaining = 64 GB. PagedAttention (95\% util): ~30. Without (50\% util): ~16. Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds. % -- FLEX: [CORE] } \centering \Large\bfseries H100 (80 GB), Llama-3 8B (FP16), 4K context.\\[0.5cm] \normalsize How many concurrent requests can you serve?\\[0.3cm] \pause \small \begin{tabular}{lcc} \toprule & \textbf{Static alloc.} & \textbf{PagedAttention} \\ \midrule KV cache utilization & $\sim$50\% & $\sim$95\% \\ Effective memory/req & $\sim$4 GB & $\sim$2.1 GB \\ Max concurrent & $\sim$16 & $\sim$30 \\ \bottomrule \end{tabular} \vspace{0.3cm} \pause \alert{PagedAttention nearly doubles serving capacity without changing hardware.} \end{frame} % --- Slide 2.8: Speculative Decoding --- \begin{frame}[fragile]{Speculative Decoding: Betting on the Draft} \note{[2 min] ``Use a small draft model to guess the next K tokens. Then verify all K in a single forward pass of the big model. If the draft is right 70\% of the time and K=5, you effectively decode ~3.5 tokens per forward pass instead of 1.'' % -- FLEX: [OPTIONAL] } \small \textbf{Insight:} Decode is memory-bound $\Rightarrow$ the GPU has spare compute. \textbf{Speculative decoding} (Leviathan et al., 2023): \begin{enumerate}\setlength\itemsep{2pt} \item \textbf{Draft} $K$ tokens with a small model (fast, low quality) \item \textbf{Verify} all $K$ tokens in one forward pass of the big model \item \textbf{Accept} the longest prefix that matches \item Speedup $\approx K \times \alpha$ where $\alpha$ is acceptance rate \end{enumerate} \vspace{0.2cm} \begin{lstlisting} # mlsysim supports speculative decoding draft = mlsysim.Models.Language.Llama3_8B # smaller result = serving.solve(model, hw, seq_len=4096, draft_model=draft, draft_acceptance_rate=0.7) print(f"Speedup: {result.speculative_speedup:.2f}x") \end{lstlisting} \end{frame} % --- Slide 2.9: Disaggregated Serving --- \begin{frame}[fragile]{Disaggregated Serving: Right Hardware for Each Phase} \note{[2 min] ``Split prefill and decode onto different node types. Prefill nodes optimize for FLOPS, decode nodes optimize for bandwidth. Transfer KV cache over the network between them.'' % -- FLEX: [OPTIONAL] } \small \textbf{Key insight:} Prefill and decode have \textit{opposite} hardware preferences. 
\begin{columns}[T] \begin{column}{0.48\textwidth} \textbf{Prefill node} \begin{itemize}\setlength\itemsep{2pt} \item Needs high FLOPS \item Moderate memory \item E.g., H100 at high utilization \end{itemize} \end{column} \begin{column}{0.48\textwidth} \textbf{Decode node} \begin{itemize}\setlength\itemsep{2pt} \item Needs high BW \item Large memory (for KV cache) \item E.g., many smaller accelerators \end{itemize} \end{column} \end{columns} \vspace{0.3cm} \begin{lstlisting} # Disaggregated serving in mlsysim result = serving.solve(model, hw, seq_len=4096, decode_hardware=mlsysim.Hardware.Cloud.A100) print(f"TTFT: {result.ttft:~P} ITL: {result.itl:~P}") \end{lstlisting} \vspace{0.2cm} \alert{Trade-off: network transfer of KV cache adds latency between phases.} \end{frame} % --- Slide 2.10: Exercise 2 --- \begin{frame}[fragile]{Exercise 2: Serving Capacity Planning} \note{[5 min] Attendees work in pairs. How many concurrent Llama-3 8B requests (4K context) can an H100 serve while maintaining ITL < 10ms? Expected: ~30 with PagedAttention at FP16. Binding: memory capacity. Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds. % -- FLEX: [CORE] } \centering \Large\bfseries Hands-On Exercise\\[0.5cm] \normalsize \textbf{Question:} You run Llama-3 8B (FP16) on one H100 with 4K context.\\ Your SLA requires ITL $<$ 10 ms.\\ How many concurrent requests can you serve? \vspace{0.3cm} \begin{lstlisting} cb = mlsysim.ContinuousBatchingModel() result = cb.solve(llama, hw, seq_len=4096, max_batch_size=64, page_size=16) print(f"Max concurrent: {result.max_concurrent_requests}") \end{lstlisting} \vspace{0.2cm} \textit{Bonus: What happens if you switch to INT8 precision? Does concurrency double?} \end{frame} % --- Slide 2.11: Fallacies --- \begin{frame}{Fallacies: Serving Edition} \note{[2 min] Walk through each fallacy with the quantitative counter-evidence. % -- FLEX: [OPTIONAL] --- can trim to 2 fallacies if behind. } \small \textbf{Fallacy:} \textit{``Faster GPUs always reduce latency.''}\\ A 3.2$\times$ FLOPS improvement (A100 $\to$ H100) yields only 1.7$\times$ ITL improvement because decode is memory-bound. \vspace{0.3cm} \textbf{Fallacy:} \textit{``Doubling memory doubles serving capacity.''}\\ Weights are fixed overhead. Going from 80 GB to 160 GB adds 80 GB, but KV cache per request stays $\sim$2 GB. Capacity goes from $\sim$32 to $\sim$72 ($\sim$2.3$\times$), not 2$\times$. \vspace{0.3cm} \textbf{Fallacy:} \textit{``Batch size 1 is fine for LLM inference.''}\\ At bs=1, MFU $<$ 5\%. You are paying for 989 TFLOPS and using $<$ 50. Continuous batching can recover 10--20$\times$ throughput. \end{frame} % --- Slide 2.12: Part 2 Key Takeaway --- \begin{frame}{Part 2: Key Takeaway} \note{[1 min] One sentence summary, repeat twice. % -- FLEX: [CORE] } \centering\Large \textbf{LLM serving has two phases\\with opposite bottlenecks.}\\[0.5cm] \normalsize \begin{itemize} \item \textbf{Prefill} (TTFT) is compute-bound --- optimize with parallelism. \item \textbf{Decode} (ITL) is memory-bound --- optimize with bandwidth. \item \textbf{KV cache} limits concurrency --- optimize with PagedAttention. \item \texttt{ServingModel.solve()} decomposes both phases in one call. 
\end{itemize} \end{frame} % ============================================================================= % PART 3: COMPRESSION & EFFICIENCY (8 slides) % ============================================================================= \section{Compression \& Efficiency} % --- Slide 3.1: Key Question --- \begin{frame}{Key Question} \note{[1 min] ``If we can shrink the model, we move less data, and the memory wall recedes. But there is a catch.'' % -- FLEX: [CORE] } \centering \Large\bfseries Can you make the model 4$\times$ smaller\\[0.2cm] and get a 4$\times$ speedup? \end{frame} % --- Slide 3.2: Wall 13 --- Compression --- \begin{frame}{Wall 13: The Fidelity Wall} \note{[2 min] ``Storage always shrinks. But inference speedup depends on the method. This distinction trips up everyone.'' % -- FLEX: [CORE] } \small \wallbox{The Fidelity Wall}{ \[ \text{Compression}_{\text{quant}} = \frac{32}{\text{bits}}, \qquad \text{Compression}_{\text{prune}} = \frac{1}{1 - \text{sparsity}} \] } \vspace{0.3cm} \begin{tabular}{lccc} \toprule \textbf{Method} & \textbf{Storage} & \textbf{Speedup} & \textbf{Accuracy} \\ \midrule FP32 $\to$ FP16 & 2$\times$ & 2$\times$ & $\sim$0\% loss \\ FP16 $\to$ INT8 & 2$\times$ & 1.5--2$\times$ & $<$1\% loss \\ FP16 $\to$ INT4 & 4$\times$ & 2--3$\times$ & 2--5\% loss \\ 50\% unstructured prune & 2$\times$ & \alert{1$\times$ (no speedup!)} & 1--3\% loss \\ 50\% structured prune & 2$\times$ & $\sim$2$\times$ & 2--5\% loss \\ 2:4 N:M sparsity & 2$\times$ & 2$\times$ & 1--2\% loss \\ \bottomrule \end{tabular} \vspace{0.2cm} \alert{Unstructured pruning saves storage but gives zero GPU speedup.\\ Only structured patterns accelerate hardware execution.} \end{frame} % --- Slide 3.3: Quantization Deep Dive --- \begin{frame}{Quantization: Trading Bits for Speed} \note{[2 min] Walk through the precision ladder. ``If you quantize Llama-3 8B from FP16 to INT4, how much memory?'' Expected: 8B * 0.5 bytes = 4 GB. Down from 16 GB. % -- FLEX: [CORE] } \small \begin{columns}[T] \begin{column}{0.55\textwidth} \textbf{The precision ladder:} \vspace{0.2cm} \begin{tikzpicture}[scale=0.8, >=Stealth] \foreach \i/\label/\bits/\color in { 0/FP32/32/errorfill, 1/FP16\slash BF16/16/routingorange, 2/INT8\slash FP8/8/computeblue, 3/INT4/4/datagreen} { \fill[\color, draw=midgray] (0, 3-\i) rectangle ({0.15*\bits}, 3.6-\i); \node[right, font=\footnotesize] at ({0.15*\bits + 0.2}, 3.3-\i) {\label}; } \draw[->, thick] (0, -0.5) -- (5.5, -0.5) node[right, font=\scriptsize] {Size}; \node[font=\scriptsize, midgray] at (2.5, -1) {$\longleftarrow$ smaller is better}; \end{tikzpicture} \end{column} \begin{column}{0.42\textwidth} \textbf{Llama-3 8B memory:} \scriptsize \begin{tabular}{lc} \toprule Precision & Weight Size \\ \midrule FP32 & 32 GB \\ FP16 & 16 GB \\ INT8 & 8 GB \\ INT4 & 4 GB \\ \bottomrule \end{tabular} \vspace{0.3cm} \normalsize At INT4, Llama-3 8B fits in\\ a \textbf{laptop GPU} (6 GB). \end{column} \end{columns} \end{frame} % --- Slide 3.4: Live Demo --- CompressionModel --- \begin{frame}[fragile]{Live Demo: Quantization Impact} \note{[2 min] Run CompressionModel live. Show storage savings and speedup side by side. 
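Rough sanity check, assuming decode stays weight-load bound: $16/3.35 \approx 4.8$ ms at FP16, $8/3.35 \approx 2.4$ ms at INT8, $4/3.35 \approx 1.2$ ms at INT4; in this idealized limit the reported speedup tracks the bit reduction, consistent with the ``speedup follows storage'' bullet on the slide.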
% -- FLEX: [CORE] } \small \begin{lstlisting} comp = mlsysim.CompressionModel() for bits in [16, 8, 4]: r = comp.solve(llama, hw, method="quantization", target_bitwidth=bits) print(f"INT{bits}: size={r.compressed_size:~P} " f"speedup={r.inference_speedup:.1f}x") \end{lstlisting} \vspace{0.2cm} \begin{exampleblock}{What to observe} \begin{itemize} \item Storage shrinks linearly with bit reduction \item Speedup follows storage for quantization (structured by nature) \item Accuracy degrades modestly at INT8, more at INT4 \end{itemize} \end{exampleblock} \end{frame} % --- Slide 3.5: Structured vs Unstructured Pruning --- \begin{frame}[fragile]{Pruning: The Structure Matters} \note{[2 min] ``Unstructured pruning zeros out individual weights. The matrix is still the same shape, so the GPU does the same number of operations. No speedup! Structured pruning removes entire rows/columns, physically shrinking the matrix.'' WARN: Students assume any compression = speedup. % -- FLEX: [CORE] } \small \begin{columns}[T] \begin{column}{0.48\textwidth} \textbf{Unstructured} \begin{itemize}\setlength\itemsep{2pt} \item Zero out individual weights \item Matrix shape unchanged \item GPU does same work (skip zeros? nope) \item \alert{Storage savings only} \end{itemize} \end{column} \begin{column}{0.48\textwidth} \textbf{Structured / N:M} \begin{itemize}\setlength\itemsep{2pt} \item Remove entire rows/columns, or\\ 2:4 pattern (Ampere+) \item Physically smaller matrices \item GPU hardware support (2:4 $\to$ 2$\times$) \item \textbf{Real speedup} \end{itemize} \end{column} \end{columns} \vspace{0.3cm} \begin{lstlisting} for stype in ["unstructured", "structured", "n_m"]: r = comp.solve(llama, hw, method="pruning", sparsity=0.5, sparsity_type=stype) print(f"{stype:>14}: {r.inference_speedup:.1f}x") \end{lstlisting} \end{frame} % --- Slide 3.5b: Predict --- What Does INT4 Change? --- \begin{frame}{Predict: What Does INT4 Change?} \note{[1 min] Quick poll. Most say "latency gets better." The real answer is fleet size. Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds.} \centering \Large \textbf{You quantize Llama-3 70B from FP16 to INT4.}\\[0.5cm] \normalsize What is the BIGGEST impact on your serving infrastructure?\\[0.3cm] \begin{enumerate}[(A)] \item Inference latency drops by 4$\times$ \item Model quality degrades significantly \item \textbf{You need half as many GPUs} \item Memory bandwidth becomes the bottleneck \end{enumerate} \end{frame} % --- Slide 3.6: Predict Before You Peek #4 --- \begin{frame}{Predict: INT4 Llama-3 8B on H100} \note{[2 min] Predict-before-reveal. If you quantize to INT4 at bs=1, is it still memory-bound? Yes! Load time = 4/3350 = 1.2 ms. Compute still ~0.03 ms. Still memory-bound but 4x faster. Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds. 
% -- FLEX: [CORE] } \centering \Large\bfseries Quantize Llama-3 8B to INT4.\\ Run inference at batch size 1 on H100.\\[0.5cm] \normalsize Is it still memory-bound?\\[0.3cm] \pause \small \begin{tabular}{lcc} \toprule & \textbf{FP16} & \textbf{INT4} \\ \midrule Weight size & 16 GB & 4 GB \\ Load time & 4.8 ms & 1.2 ms \\ Compute time & 0.03 ms & 0.03 ms \\ \midrule Bottleneck & Memory & \textbf{Still Memory!} \\ Decode speedup & --- & \textbf{4$\times$} \\ \bottomrule \end{tabular} \vspace{0.3cm} \pause \alert{INT4 gives 4$\times$ faster decode, but the GPU is still memory-bound.\\ The memory wall is that deep.} \end{frame} % --- Slide 3.7: Exercise 3 --- \begin{frame}[fragile]{Exercise 3: Compression Tradeoffs} \note{[5 min] Compare INT8 quantization vs 50\% structured pruning for Llama-3 8B on H100. Which gives better speedup per accuracy loss? Expected: INT8 wins (~2x speedup, <1\% loss vs 2-5\% loss). Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds. % -- FLEX: [CORE] } \centering \Large\bfseries Hands-On Exercise\\[0.5cm] \normalsize \textbf{Question:} For Llama-3 8B on H100, which is the better deal? \begin{enumerate} \item INT8 quantization \item 50\% structured pruning \end{enumerate} Compare speedup per accuracy point lost. \vspace{0.3cm} \begin{lstlisting} r1 = comp.solve(llama, hw, method="quantization", target_bitwidth=8) r2 = comp.solve(llama, hw, method="pruning", sparsity=0.5, sparsity_type="structured") print(f"Quant: {r1.inference_speedup:.1f}x / " f"{abs(r1.accuracy_delta):.1%} loss") print(f"Prune: {r2.inference_speedup:.1f}x / " f"{abs(r2.accuracy_delta):.1%} loss") \end{lstlisting} \end{frame} % --- Slide 3.7b: Compression Changes Fleet Architecture --- \begin{frame}{Compression Changes Fleet Architecture} \note{[3 min] This is the ``aha'' that compression is architecture, not optimization. The punchline: INT4 halves your GPU count AND your electricity bill.} \small \textbf{Llama-3 70B Serving Fleet:} \vspace{0.3cm} \begin{tabular}{@{}lrrr@{}} \toprule Precision & Model Size & GPUs Needed & Annual Cost \\ \midrule FP16 & 140 GB & 4 (TP=4) & \$480K \\ INT8 & 70 GB & 2 (TP=2) & \$240K \\ INT4 & 35 GB & 1 & \$120K \\ \bottomrule \end{tabular} \vspace{0.3cm} \textbf{INT4 doesn't just improve latency --- it eliminates 3 GPUs per replica.}\\ At 100 replicas for 1000 QPS: that's \textbf{300 fewer GPUs} and \textbf{\$36M saved per year}. \vfill \centering \small\textcolor{gray}{This is why quantization is a Day 1 architectural decision, not a Day 100 optimization.} \end{frame} % --- Slide 3.8: Part 3 Key Takeaway --- \begin{frame}{Part 3: Key Takeaway} \note{[1 min] One sentence. Repeat. % -- FLEX: [CORE] } \centering\Large \textbf{Storage savings $\neq$ inference speedup.}\\[0.5cm] \normalsize \begin{itemize} \item Quantization gives both storage and speed gains. \item Unstructured pruning gives storage only --- zero GPU speedup. \item N:M sparsity (2:4) is the hardware-friendly middle ground. \item Even at INT4, LLM decode is \textit{still} memory-bound. \item \texttt{CompressionModel.solve()} quantifies the full tradeoff. \end{itemize} \end{frame} % --- Roadmap: After Lunch --- \begin{frame}{Roadmap: Afternoon Session} \note{[1 min] Re-energize the room. ``Welcome back. The morning was about single-node physics. 
The afternoon is about fleets, money, and carbon.''} \centering\small \begin{tabular}{rll} \toprule \textbf{Time} & \textbf{Part} & \textbf{Status} \\ \midrule 9:00--12:00 & Parts 0--3: Single Node & \checkmark Done \\ \midrule \rowcolor{crimson!12} 1:00--2:15 & \textbf{Part 4: Going Distributed} & \textbf{$\leftarrow$ You are here} \\ 2:30--3:15 & Part 5: Economics \& Sustainability & \\ 3:15--3:45 & Part 6: Design Space Exploration & \\ 3:45--4:15 & Part 7: TinyML to Frontier & \\ 4:15--4:45 & Part 8: Advanced Topics & \\ 4:45--5:00 & Part 9: Wrap-Up \& Capstone & \\ \bottomrule \end{tabular} \end{frame} % ============================================================================= % PART 4: GOING DISTRIBUTED (15 slides) % ============================================================================= \section{Going Distributed} % --- Slide 4.1: Key Question --- \begin{frame}{Key Question} \note{[1 min] ``Your model does not fit on one GPU. Or it fits but training would take a year. Either way, you need more GPUs. But adding GPUs is not free.'' % -- FLEX: [CORE] } \centering \Large\bfseries If 1 GPU takes 30 days,\\[0.2cm] do 1000 GPUs take 43 minutes? \end{frame} % --- Slide 4.2: Why Distribute? --- \begin{frame}{Why Distribute?} \note{[2 min] ``Two reasons to go distributed: (1) the model does not fit in one GPU's memory, or (2) you want to finish sooner. Reason 1 is a hard constraint. Reason 2 is an optimization.'' % -- FLEX: [CORE] } \small \textbf{Reason 1: Model does not fit} \begin{itemize} \item Llama-3 70B FP16 $=$ 140 GB $>$ H100's 80 GB \item \alert{Must} split across at least 2 GPUs \end{itemize} \vspace{0.3cm} \textbf{Reason 2: Time-to-train} \begin{itemize} \item 1 H100 training Llama-3 70B $\approx$ 15 GPU-years \item 1024 H100s $\approx$ 5 days (if scaling were perfect) \item But scaling is \textit{never} perfect... \end{itemize} \vspace{0.3cm} \centering \begin{tikzpicture}[scale=0.7, >=Stealth] \draw[->, thick] (0,0) -- (7,0) node[right, font=\scriptsize] {GPUs}; \draw[->, thick] (0,0) -- (0,4) node[above, font=\scriptsize] {Speedup}; \draw[dashed, midgray] (0,0) -- (6.5,3.9) node[right, font=\scriptsize\itshape] {ideal}; \draw[very thick, crimson] (0,0) .. controls (2,2) and (4,3) .. (6.5,3.2); \node[font=\scriptsize, crimson] at (5.5, 2.4) {reality}; \end{tikzpicture} \end{frame} % --- Slide 4.3: The Three Dimensions of Parallelism --- \begin{frame}{3D Parallelism: DP $\times$ TP $\times$ PP} \note{[3 min] ``Every distributed strategy is a combination of three dimensions.'' WARN: Students often confuse TP and PP. TP splits within a layer; PP splits between layers. % -- FLEX: [CORE] } \small \begin{columns}[T] \begin{column}{0.32\textwidth} \textbf{Data Parallel (DP)} \begin{itemize}\setlength\itemsep{1pt}\footnotesize \item Replicate full model \item Split data across replicas \item AllReduce gradients \item \textit{Most common} \end{itemize} \end{column} \begin{column}{0.32\textwidth} \textbf{Tensor Parallel (TP)} \begin{itemize}\setlength\itemsep{1pt}\footnotesize \item Split each layer's weights \item Split activations, not data \item AllReduce per layer (2$\times$!) 
\item Needs fast interconnect
\end{itemize}
\end{column}
\begin{column}{0.32\textwidth}
\textbf{Pipeline Parallel (PP)}
\begin{itemize}\setlength\itemsep{1pt}\footnotesize
\item Split model into stages
\item Each GPU owns $L/\text{PP}$ layers
\item Pipeline bubbles
\item Needs less bandwidth
\end{itemize}
\end{column}
\end{columns}

\vspace{0.3cm}
Total GPUs: $N = \text{DP} \times \text{TP} \times \text{PP}$

\vspace{0.2cm}
\centering
\scriptsize
\begin{tabular}{lccc}
\toprule
\textbf{Property} & \textbf{DP} & \textbf{TP} & \textbf{PP} \\
\midrule
Splits & Data & Weights + Activations & Layers \\
Communication & AllReduce (gradients) & AllReduce (activations) & Point-to-point \\
BW requirement & Moderate & Very high (NVLink) & Low \\
Bubble overhead & None & None & $\sim(P{-}1)/(M{+}P{-}1)$ \\
\bottomrule
\end{tabular}
\end{frame}

% --- Slide 4.3b: AllReduce Concrete Example ---
\begin{frame}[fragile]{AllReduce: A Concrete Example}
\note{[2 min] NUMBERS FIRST, then formula. Students need to see the magnitude before the algebra.}
\small
\textbf{Setup:} 8 H100 GPUs, NVLink at 900 GB/s, Llama-3 8B (16 GB gradients)

\vspace{0.3cm}
\begin{enumerate}
\item Each GPU computes its local gradient: \textbf{16 GB}
\item All 8 GPUs must end up with the \textbf{same averaged gradient}
\item Ring AllReduce passes chunks around the ring\ldots
\end{enumerate}

\vspace{0.3cm}
\begin{lstlisting}
t = mlsysim.core.formulas.calc_ring_allreduce_time(
    message_bytes=16e9,
    n_gpus=8,
    bandwidth_bytes_s=900e9,
    latency_s=500e-9,
)
print(f"AllReduce time: {t.to('ms'):.1f}")
# -> ~35 ms (bandwidth-dominated, latency is negligible)
\end{lstlisting}

\vfill
\centering
\textbf{35 ms} to synchronize 8 GPUs. Now: what happens at 256 GPUs?
\end{frame}

% --- Slide 4.4: Data Parallelism + AllReduce ---
\begin{frame}{Wall 14: The Communication Wall (AllReduce)}
\note{[3 min] ``Quick: 1 GB of gradients, 8 GPUs, 50 GB/s InfiniBand. How long for AllReduce?''
Expected: 2*(7/8)*1/50 = 35 ms.
Ring AllReduce sends 2(N-1)/N times the data. As N grows, this approaches 2x.
% -- FLEX: [CORE]
}
\small
\wallbox{The Communication Wall}{
\[
T_{\text{AllReduce}} = \frac{2(N-1)}{N} \times \frac{M}{BW} + 2(N-1) \times \text{latency}
\]
}
\vspace{0.2cm}
\textbf{Example:} 1 GB gradients, 8$\times$ H100 on NVLink (900 GB/s)
\[
T = \frac{2 \times 7}{8} \times \frac{1}{900} \approx 1.9\;\text{ms}
\]
\vspace{0.2cm}
\textbf{Same gradients}, 256$\times$ H100 across InfiniBand (50 GB/s):
\[
T = \frac{2 \times 255}{256} \times \frac{1}{50} \approx 40\;\text{ms}
\]
\vspace{0.2cm}
\alert{NVLink is 18$\times$ faster than InfiniBand for AllReduce.\\
That is why TP must stay within a node.}
\end{frame}

% --- Slide 4.5: Tensor Parallelism ---
\begin{frame}{Tensor Parallelism: Splitting Layers}
\note{[3 min] ``TP splits each layer's weight matrix across GPUs. Every forward and backward pass requires 2 AllReduce ops per layer.
That is why TP only works on NVLink, not across nodes.'' % -- FLEX: [CORE] } \small \textbf{How it works:} \begin{enumerate}\setlength\itemsep{2pt} \item Split weight matrix $W$ column-wise across $T$ GPUs \item Each GPU computes $Y_i = X \cdot W_i$ (partial result) \item AllReduce to combine: $Y = \sum Y_i$ \item \alert{2 AllReduce ops per layer} (forward + backward) \end{enumerate} \vspace{0.3cm} \textbf{TP overhead:} \[ T_{\text{TP}} = 2 \times L \times T_{\text{AllReduce}}(T) \] \begin{exampleblock}{Llama-3 70B, TP=8 on NVLink (900 GB/s)} \begin{itemize} \item 80 layers $\times$ 2 AllReduce $\times$ $\sim$0.1 ms each $\approx$ \textbf{16 ms overhead per step} \item This is 10--20\% of a typical training step \end{itemize} \end{exampleblock} \end{frame} % --- Slide 4.6: Pipeline Parallelism --- \begin{frame}{Pipeline Parallelism: The Bubble Problem} \note{[3 min] ``With 4 stages and 4 microbatches, what fraction of time is wasted?'' Expected: 3/7 = 43\%. With 32 microbatches: 3/35 = 8.6\%. Lesson: more microbatches = smaller bubble. % -- FLEX: [CORE] } \small \wallbox{Pipeline Bubble Fraction}{ \[ \text{Bubble} = \frac{P - 1}{M + P - 1} \] where $P$ = pipeline stages, $M$ = microbatches } \vspace{0.3cm} \scriptsize \begin{tabular}{lcccc} \toprule $P$ (stages) & $M$ (microbatches) & Bubble & Effective utilization \\ \midrule 4 & 4 & 43\% & 57\% \\ 4 & 16 & 16\% & 84\% \\ 4 & 32 & 8.6\% & 91\% \\ 8 & 32 & 18\% & 82\% \\ \bottomrule \end{tabular} \normalsize \vspace{0.2cm} \alert{More microbatches $\Rightarrow$ smaller bubble.\\ But more microbatches = more memory for activations.} \end{frame} % --- Slide 4.7: Gradient Accumulation --- \begin{frame}{Gradient Accumulation: Virtual Batch Size} \note{[2 min] ``Process K small microbatches and accumulate gradients before the optimizer step. This fills the pipeline and amortizes AllReduce.'' % -- FLEX: [OPTIONAL] } \small \[ B_{\text{effective}} = B_{\text{micro}} \times K \times \text{DP} \] \textbf{Why accumulate?} \begin{itemize}\setlength\itemsep{2pt} \item Fill the pipeline ($M = K$ microbatches) \item Amortize AllReduce cost over $K$ steps \item Simulate large batch size without large memory \item Trade compute (more forward passes) for communication (fewer AllReduce) \end{itemize} \vspace{0.2cm} \textbf{Example:} DP=128, $B_{\text{micro}}$=4, $K$=8 \begin{itemize} \item $B_{\text{effective}} = 4 \times 8 \times 128 = 4096$ \item AllReduce only once per 8 microbatches \item Pipeline bubble: $(P-1)/(8+P-1)$ --- much smaller \end{itemize} \end{frame} % --- Slide 4.8: Hierarchical Communication --- \begin{frame}{Hierarchical AllReduce: NVLink + InfiniBand} \note{[2 min] ``Hierarchical AllReduce first reduces within each node (fast NVLink), then across nodes (slower IB), then broadcasts back. 
This exploits the bandwidth hierarchy.''
% -- FLEX: [OPTIONAL]
}
\small
\textbf{Real cluster topology:}

\centering
\begin{tikzpicture}[scale=0.75, >=Stealth,
  gpu/.style={draw, fill=computeblue, rounded corners, minimum width=0.6cm, minimum height=0.5cm, font=\tiny},
  node/.style={draw, fill=white, rounded corners=4pt, dashed, inner sep=4pt}]
  % Node 0
  \node[node, label=above:{\scriptsize Node 0}] (n0) at (0,0) {
    \begin{tikzpicture}
      \foreach \i in {0,...,3} {
        \node[gpu] (g0\i) at (\i*0.8, 0) {G\i};
      }
    \end{tikzpicture}
  };
  % Node 1
  \node[node, label=above:{\scriptsize Node 1}] (n1) at (5.5,0) {
    \begin{tikzpicture}
      \foreach \i in {0,...,3} {
        \node[gpu] (g1\i) at (\i*0.8, 0) {G\i};
      }
    \end{tikzpicture}
  };
  % NVLink labels
  \node[font=\tiny, datastroke] at (0, -0.9) {NVLink 900 GB/s};
  \node[font=\tiny, datastroke] at (5.5, -0.9) {NVLink 900 GB/s};
  % IB link
  \draw[very thick, crimson, <->] (2.2, 0) -- (3.3, 0) node[midway, above, font=\tiny] {IB 50 GB/s};
\end{tikzpicture}

\vspace{0.3cm}
\flushleft
\small
\textbf{3-step hierarchical AllReduce:}
\begin{enumerate}\setlength\itemsep{1pt}
\item \textbf{Local reduce} within each node (NVLink --- fast)
\item \textbf{Global AllReduce} across leader GPUs (InfiniBand --- slow)
\item \textbf{Local broadcast} within each node (NVLink --- fast)
\end{enumerate}
\alert{TP within node (NVLink). DP across nodes (InfiniBand).}
\end{frame}

% --- Slide 4.9: Live Demo --- DistributedModel ---
\begin{frame}[fragile]{Live Demo: Distributed Training Analysis}
\note{[3 min] Run DistributedModel live. Show communication overhead, bubble fraction, and scaling efficiency.
% -- FLEX: [CORE]
}
\small
\begin{lstlisting}
fleet = mlsysim.Systems.Clusters.Frontier_8K
dist = mlsysim.DistributedModel()
result = dist.solve(llama, fleet, batch_size=4096,
                    tp_size=8, pp_size=1,
                    microbatch_count=32, seq_len=4096)
print(f"Scaling eff: {result.scaling_efficiency:.1%}")
print(f"Comm overhead: {result.communication_overhead:.1%}")
print(f"Effective MFU: {result.effective_mfu:.1%}")
\end{lstlisting}
\vspace{0.2cm}
\begin{exampleblock}{What to look for}
Communication overhead + bubble fraction = total efficiency loss.
Effective MFU $=$ single-node MFU $\times$ scaling efficiency.
\end{exampleblock}
\end{frame}

% --- Slide 4.10: Wall 15 --- The Fragility Wall ---
\begin{frame}{Wall 15: The Fragility Wall (Reliability)}
\note{[2 min] ``If you have 10,000 GPUs each with 50,000 hour MTBF, what is the cluster MTBF?''
Expected: 50,000/10,000 = 5 hours. This is why checkpointing exists.
% -- FLEX: [CORE]
}
\small
\wallbox{The Fragility Wall}{
\[
\text{Cluster MTBF} = \frac{\text{Component MTBF}}{N_{\text{components}}}
\]
}
\vspace{0.2cm}
\begin{tabular}{lcc}
\toprule
\textbf{Scale} & \textbf{GPUs} & \textbf{Cluster MTBF} \\
\midrule
Research lab & 8 & 260 days \\
Mid cluster & 256 & 8 days \\
Large cluster & 1,024 & 2 days \\
Frontier-scale & 8,192 & 6 hours \\
Mega cluster & 100K & 30 minutes \\
\bottomrule
\end{tabular}

\vspace{0.1cm}
{\scriptsize Assumes a 50{,}000-hour per-GPU MTBF.}

\vspace{0.2cm}
\alert{At frontier scale, something breaks every 6 hours.\\
Without checkpointing, every failure wastes the entire run since the last save.}
\end{frame}
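% --- Slide 4.10c: Fragility, in Numbers (optional sketch) ---
\begin{frame}[fragile]{Fragility, in Numbers}
\note{[1 min] Optional. Plain-Python version of the MTBF table; it assumes the same 50,000-hour per-GPU MTBF. No mlsysim call needed.
% -- FLEX: [OPTIONAL]
}
\small
The table above is one division away (plain Python; the 50{,}000-hour per-GPU MTBF is an assumption carried over from the previous slide):
\begin{lstlisting}
COMPONENT_MTBF_H = 50_000   # assumed per-GPU MTBF, in hours

def cluster_mtbf_hours(n_gpus):
    # Fragility Wall: cluster MTBF = component MTBF / N
    return COMPONENT_MTBF_H / n_gpus

for n in [8, 256, 1024, 8192, 100_000]:
    print(f"{n:>7} GPUs -> MTBF {cluster_mtbf_hours(n):8.1f} h")
# 8 GPUs -> 6250 h (~260 days) ... 100,000 GPUs -> 0.5 h (30 min)
\end{lstlisting}
\vspace{0.2cm}
The checkpoint interval has to shrink as the cluster MTBF shrinks.
\end{frame}

% --- Slide 4.10b: Predict --- Scaling to 256 GPUs ---
\begin{frame}{Predict: Scaling to 256 GPUs}
\note{[2 min] PREDICTION. Hands up for each answer. Most will pick (A), perfect scaling. Turn to your neighbor: did you get the same answer? Why or why not?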
60 seconds.} \centering \Large \textbf{You have 8 H100s doing data-parallel training.}\\[0.5cm] \textbf{You scale to 256 GPUs.}\\[0.5cm] \normalsize How much faster will training be?\\[0.3cm] \begin{enumerate}[(A)] \item 32$\times$ faster (perfect scaling) \item 20--25$\times$ faster \item 10--15$\times$ faster \item \textbf{It depends on the model size} \end{enumerate} \end{frame} % --- Slide 4.11: Scaling Efficiency --- \begin{frame}{Scaling Efficiency: The Amdahl Trap} \note{[2 min] ``Scaling efficiency is the fraction of ideal speedup you actually achieve.'' Includes comm overhead, pipeline bubbles, stragglers, and failure recovery. % -- FLEX: [CORE] } \small \[ \eta_{\text{scaling}} = \frac{\text{Actual speedup}}{N} = \frac{1}{1 + \text{comm\_frac} + \text{bubble\_frac} + \text{straggler\_frac}} \] \vspace{0.3cm} \begin{columns}[T] \begin{column}{0.55\textwidth} \textbf{What eats scaling efficiency:} \begin{enumerate}\setlength\itemsep{2pt} \item AllReduce communication \item Pipeline bubbles \item Straggler effects (slowest GPU) \item Checkpoint I/O \item Failure recovery \end{enumerate} \end{column} \begin{column}{0.42\textwidth} \centering \scriptsize \begin{tabular}{lc} \toprule \textbf{System} & $\eta_{\text{scaling}}$ \\ \midrule 8 GPUs (NVLink) & 95--98\% \\ 64 GPUs (IB) & 85--92\% \\ 1024 GPUs & 70--85\% \\ 8192 GPUs & 55--70\% \\ \bottomrule \end{tabular} \end{column} \end{columns} \vspace{0.3cm} \alert{At 8192 GPUs, you lose 30--45\% of your compute to overhead.} \end{frame} % --- Slide 4.12: Predict Before You Peek #5 --- \begin{frame}{Predict: Optimal Parallelism Config} \note{[2 min] You have 64 H100s. Llama-3 70B (140 GB FP16). What TP x PP x DP? Give 90 seconds. Expected: TP=8 (NVLink), PP=1 (no bubbles), DP=8 (64/8). Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds. % -- FLEX: [CORE] } \centering \Large\bfseries 64 H100s. Llama-3 70B (140 GB FP16).\\[0.3cm] \normalsize What is the optimal TP $\times$ PP $\times$ DP?\\[0.5cm] \pause \small \begin{tabular}{lcccl} \toprule \textbf{Config} & \textbf{TP} & \textbf{PP} & \textbf{DP} & \textbf{Why} \\ \midrule Candidate A & 8 & 1 & 8 & TP within node, no bubbles \\ Candidate B & 4 & 2 & 8 & Less TP comm, but has bubbles \\ Candidate C & 2 & 4 & 8 & Minimal TP, large bubble \\ \bottomrule \end{tabular} \vspace{0.3cm} \pause \alert{Candidate A is typically best:} TP=8 uses full NVLink bandwidth, PP=1 avoids pipeline bubbles entirely, DP=8 across nodes. \vspace{0.2cm} \textit{Rule of thumb: maximize TP within a node, minimize PP.} \end{frame} % --- Slide 4.13: Exercise 4 --- \begin{frame}[fragile]{Exercise 4: Distributed Training Design} \note{[5 min] Sweep TP in [1,2,4,8] and PP in [1,2,4,8] for Llama-3 70B on 64 H100s. Expected: TP=8, PP=1, DP=8. Turn to your neighbor: did you get the same answer? Why or why not? 60 seconds. % -- FLEX: [CORE] } \centering \Large\bfseries Hands-On Exercise\\[0.5cm] \normalsize \textbf{Question:} Find the optimal TP$\times$PP for Llama-3 70B on 64 H100s. 
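\vspace{0.2cm}
{\small Starter sketch (it constructs its own \texttt{DistributedModel}, so it runs standalone):}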
\vspace{0.3cm}
\begin{lstlisting}
llama70 = mlsysim.Models.Language.Llama3_70B
fleet = mlsysim.Systems.Clusters.Research_256
dist = mlsysim.DistributedModel()   # same solver as the live demo

for tp in [1, 2, 4, 8]:
    for pp in [1, 2, 4, 8]:
        if tp * pp > 64:
            continue
        dp = 64 // (tp * pp)        # remaining GPUs go to data parallelism
        r = dist.solve(llama70, fleet, batch_size=512,
                       tp_size=tp, pp_size=pp, seq_len=4096,
                       microbatch_count=max(4, 64 // (tp * pp)))
        print(f"TP={tp} PP={pp} DP={dp} eff={r.scaling_efficiency:.1%}")
\end{lstlisting}
\end{frame}

% --- Slide 4.14: Straggler Effects ---
\begin{frame}{Stragglers: The Slowest GPU Sets the Pace}
\note{[2 min] ``In synchronous training, every GPU must finish before the next step begins. At 1000 GPUs, even 1\% variation means 10 GPUs are significantly slow on any given step.''
% -- FLEX: [OPTIONAL]
}
\small
\textbf{Synchronous training:} step time $=$ $\max_i(T_i)$

\vspace{0.2cm}
\begin{itemize}\setlength\itemsep{2pt}
\item \textbf{Thermal throttling:} hot GPUs clock down 5--10\%
\item \textbf{Network congestion:} some AllReduce messages delayed
\item \textbf{OS jitter:} background tasks steal cycles
\item \textbf{Memory pressure:} GC pauses in the data pipeline
\end{itemize}

\vspace{0.3cm}
\textbf{Mitigation strategies:}
\begin{itemize}\setlength\itemsep{2pt}
\item Asynchronous SGD (trade accuracy for speed)
\item Backup workers (redundant computation)
\item Bounded staleness (allow slight divergence)
\item \texttt{DistributedModel(straggler\_factor=1.05)} to simulate 5\% drag
\end{itemize}
\end{frame}

% --- Slide 4.15: Part 4 Key Takeaway ---
\begin{frame}{Part 4: Key Takeaway}
\note{[1 min] One sentence. Repeat. ``Distributed training is a communication problem disguised as a compute problem.''
% -- FLEX: [CORE]
}
\centering\Large
\textbf{Distributed training is a communication problem\\
disguised as a compute problem.}\\[0.5cm]
\normalsize
\begin{itemize}
\item 3D parallelism (DP $\times$ TP $\times$ PP) decomposes the problem.
\item TP needs NVLink (within node). DP works over InfiniBand (across nodes).
\item Pipeline bubbles shrink with more microbatches.
\item Reliability degrades as $\text{MTBF}/N$ --- checkpointing is mandatory.
\item \texttt{DistributedModel.solve()} captures all these effects.
\end{itemize}
\vspace{0.5cm}
\centering
\textit{Lunch break --- reconvene at 1:00 PM for Part 5.}
\end{frame}

\end{document}