% Files
% cs249r_book/mlsysim/tutorial/slides/related_work.tex
% Vijay Janapa Reddi c7bd160e15 feat(tutorial): multi-vendor examples + slide polish for A+ quality
% - Changed A100-vs-H100 comparisons to H100-vs-MI300X-vs-Gaudi3 three-way
% - Added Hardware Zoo slide showing all 22 platforms across 6 vendors
% - Added timing markers [N min] to speaker notes throughout
% - Added "turn to your neighbor" discussion prompts after predict slides
% - Added roadmap "You Are Here" slides after each break
% - Stripped code boilerplate (import only shown once, aliases for later demos)
% - Added figure references for new SVGs (multi-vendor-roofline, parallelism-3d,
%   memory-hierarchy, pareto-front)
% - Updated related_work.tex comparison table to highlight vendor breadth
% 2026-04-02 07:23:12 -04:00
%
% 363 lines
% 13 KiB
% TeX

% =============================================================================
% MLSys-im ISCA 2026 Tutorial — Related Work & Positioning (~8 slides)
% =============================================================================
% For ISCA distribution: self-contained Metropolis theme.
% For internal use: swap to beamerthememlsys.sty
% =============================================================================
% This file is designed to be \input{related_work} into tutorial_part1.tex
% or tutorial_part2.tex. It contains no preamble or document environment.
% =============================================================================
% =====================================================================
% RELATED WORK & POSITIONING
% =====================================================================
\section{Related Work}
% --- Slide R.1: The Landscape ---
\begin{frame}{The Landscape: Existing ML Performance Tools}
\centering
\scriptsize
\begin{tabular}{@{}llllll@{}}
\toprule
\textbf{Tool} & \textbf{Type} & \textbf{Speed} & \textbf{Scope} & \textbf{HW Coverage} & \textbf{Access} \\
\midrule
ASTRA-sim & Discrete-event & Hours & Network & NVIDIA & Open \\
Calculon & Analytical & Seconds & Training & NVIDIA & Open \\
DeepSpeed Profiler & Runtime & Real-time & Single-node & NVIDIA & Requires GPU \\
LLMPerf / vLLM & Empirical & Minutes & Serving & NVIDIA, AMD & Requires GPU \\
Roofline Toolkit & Empirical & Minutes & Compute/memory & Intel, NVIDIA & Requires HW \\
Internal (Google, Meta) & Analytical & Seconds & Full-stack & Custom & Proprietary \\
\rowcolor{crimson!12}
\textbf{mlsysim} & \textbf{Analytical} & \textbf{Sub-second} & \textbf{22 walls} & \textbf{6 vendors} & \textbf{Open, no GPU} \\
\bottomrule
\end{tabular}
\vspace{0.8em}
\small
\textbf{Key observations:} (1) Open tools cover 3--4 constraints each.
(2) Most are NVIDIA-only. mlsysim covers \textbf{NVIDIA, AMD, Intel, Google, AWS, Cerebras,}
and custom silicon---full stack from compute to carbon.
\end{frame}
% --- Slide R.2: What Makes mlsysim Different? ---
\begin{frame}{What Makes mlsysim Different?}
\begin{columns}[T]
\column{0.55\textwidth}
\begin{enumerate}
\item \textbf{Breadth: 22 walls in one pipeline}\\
{\small Others cover 3--4 constraints.\\
mlsysim composes all 22 into a single\\
\texttt{Pipeline.solve()} call.\par}
\vspace{0.6em}
\item \textbf{Unit safety: Pint dimensional analysis}\\
{\small Every quantity carries its unit.\\
\texttt{GB + TFLOPS} $\Rightarrow$ \textcolor{crimson}{DimensionalityError}.\\
Catches errors spreadsheets never will.\par}
\vspace{0.6em}
\item \textbf{Traceability: Every constant is cited}\\
{\small \texttt{TraceableConstant} records source,\\
date, and DOI for every hardware spec.\\
Audit any number back to its origin.\par}
\end{enumerate}
\column{0.42\textwidth}
\centering
\begin{block}{The Trifecta}
\small
\begin{tabular}{@{}lc@{}}
Feature & Others \\
\midrule
22 walls & 3--4 \\
Unit safety & None \\
Traceability & Rare \\
\end{tabular}
\end{block}
\vspace{1em}
\begin{exampleblock}{One-Liner}
\footnotesize
\texttt{pip install mlsysim}\\[2pt]
No GPU. No cluster.\\
Just Python 3.10+.
\end{exampleblock}
\end{columns}
\end{frame}
% --- Slide R.3: The Fidelity-Speed Spectrum ---
\begin{frame}{The Fidelity--Speed Spectrum}
\centering
\begin{tikzpicture}[
every node/.style={font=\small},
arrow/.style={-{Stealth[length=3mm]}, thick, color=midgray},
tool/.style={draw, rounded corners=3pt, fill=computeblue, text=darktext,
minimum height=0.7cm, minimum width=2.2cm, font=\footnotesize\bfseries},
highlight/.style={tool, fill=crimson!15, draw=crimson, thick},
]
% Axis
\draw[arrow] (0,0) -- (12.5,0) node[right] {\textbf{Speed}};
\draw[arrow] (0,0) -- (0,4.5) node[above] {\textbf{Fidelity}};
% Labels on axis
\node[font=\scriptsize, color=midgray] at (1.5,-0.4) {Hours};
\node[font=\scriptsize, color=midgray] at (5.0,-0.4) {Minutes};
\node[font=\scriptsize, color=midgray] at (8.5,-0.4) {Seconds};
\node[font=\scriptsize, color=midgray] at (11.5,-0.4) {Instant};
% Tools
\node[tool] (cycle) at (1.5, 3.8) {Cycle-accurate};
\node[tool] (astra) at (3.2, 3.0) {ASTRA-sim};
\node[tool] (llmperf) at (4.5, 2.2) {LLMPerf};
\node[tool] (calculon) at (7.0, 2.5) {Calculon};
\node[highlight] (mlsysim) at (9.0, 2.0) {mlsysim};
\node[tool] (napkin) at (11.5, 0.8) {Napkin math};
% Annotation for mlsysim
\node[font=\scriptsize, color=crimson, align=center] at (9.0, 1.0)
{22 walls, sub-second\\composable pipeline};
% Sweet-spot region
\draw[dashed, crimson, thick, rounded corners=6pt]
(7.5, 0.5) rectangle (10.5, 2.8);
\node[font=\scriptsize\itshape, color=crimson] at (9.0, 3.1)
{``Fast analytical'' sweet spot};
\end{tikzpicture}
\vspace{0.3em}
\small\itshape
mlsysim trades cycle-accuracy for speed and breadth.\\
The right tool depends on the question you are asking.
\end{frame}
% --- Slide R.4: What mlsysim Does NOT Do ---
\begin{frame}{What mlsysim Does NOT Do}
\begin{columns}[T]
\column{0.52\textwidth}
\textbf{\textcolor{crimson}{Not a replacement for:}}
\begin{itemize}
\item \textbf{Benchmarking} --- real measurements on real hardware
\item \textbf{Cycle-accurate simulation} --- microarchitectural detail
\item \textbf{Production deployment} --- no orchestration, no serving
\item \textbf{Framework profiling} --- no kernel-level tracing
\end{itemize}
\vspace{0.5em}
\textbf{Prediction accuracy:}\\
Within \alert{2--5$\times$} of measured performance.\\
Useful for \emph{ranking} and \emph{bounding},\\
not for exact latency guarantees.
\column{0.45\textwidth}
\textbf{\textcolor{datastroke}{What it IS:}}
\begin{itemize}
\item A \textbf{reasoning framework}
\item Answers ``which constraint binds?''
\item Answers ``is this hardware sufficient?''
\item Answers ``what should I try first?''
\item Runs in $<$1\,ms per evaluation
\item No GPU required
\end{itemize}
\vspace{0.5em}
\begin{center}
\small\itshape
``All models are wrong,\\
but some are useful.''\\
--- George Box
\end{center}
\end{columns}
\end{frame}
% --- Slide R.5: How to Think About Accuracy ---
\begin{frame}{How to Think About Accuracy: The CPI Analogy}
\begin{block}{Patterson's Insight (Computer Architecture)}
\small
Hennessy \& Patterson did not predict CPI from first principles.\\
They \textbf{measured} CPI empirically, then used it to \textbf{reason} about
architectural tradeoffs. The value was in the \textbf{framework}, not the exact number.
\end{block}
\vspace{0.5em}
\begin{block}{Our Approach (ML Systems)}
\small
We do not predict $\eta$ (efficiency) from first principles.\\
We \textbf{measure} $\eta$ empirically (MFU from published papers), then use it
to \textbf{reason} about system tradeoffs across 22 walls.
\end{block}
\vspace{0.8em}
\centering
\begin{tabular}{@{}lll@{}}
\toprule
& \textbf{Computer Architecture} & \textbf{ML Systems} \\
\midrule
Empirical constant & CPI & $\eta$ (MFU) \\
Framework & Iron Law of CPU Perf & Iron Law of ML Systems \\
Value & Reason about ISA tradeoffs & Reason about system walls \\
\bottomrule
\end{tabular}
\end{frame}
% --- Slide R.6: The Iron Law --- Our Contribution ---
\begin{frame}{The Iron Law: Every Wall Maps to a Term}
\begin{center}
\large
\[
T = \frac{\text{FLOPs}}{N \;\times\; \text{Peak} \;\times\; \text{MFU} \;\times\; \eta_{\text{scaling}} \;\times\; \text{Goodput}}
\]
\end{center}
\vspace{0.3em}
\small
\begin{columns}[T]
\column{0.48\textwidth}
\begin{tabular}{@{}ll@{}}
\toprule
\textbf{Term} & \textbf{Walls} \\
\midrule
FLOPs (numerator) & 11 Complexity \\
& 12 Reasoning \\
& 13 Fidelity \\
\midrule
$N$ (parallelism) & 14 Communication \\
& 15 Fragility \\
& 16 Multi-tenant \\
\bottomrule
\end{tabular}
\column{0.48\textwidth}
\begin{tabular}{@{}ll@{}}
\toprule
\textbf{Term} & \textbf{Walls} \\
\midrule
Peak (hardware) & 1 Compute \\
& 2 Memory \\
& 6 Streaming \\
\midrule
MFU (efficiency) & 3 Software \\
& 4 Serving \\
& 5 Batching \\
\midrule
Goodput & 8 Ingestion \\
& 9 Transformation \\
& 19 Checkpoint \\
\bottomrule
\end{tabular}
\end{columns}
\vspace{0.5em}
\centering
\footnotesize
Remaining walls: 7 Tail Latency, 10 Locality, 17 Capital, 18 Sustainability, 20 Safety, 21 Sensitivity, 22 Synthesis\\
These are \textbf{cross-cutting constraints} and \textbf{meta-analysis tools} that modulate the Iron Law terms.
\end{frame}
% --- Slide R.7: 22 Walls at a Glance ---
\begin{frame}{22 Walls at a Glance}
\centering
\scriptsize
\setlength{\tabcolsep}{3pt}
\begin{tabular}{@{}rllll@{}}
\toprule
\textbf{\#} & \textbf{Wall} & \textbf{Domain} & \textbf{Constraint} & \textbf{Key Equation} \\
\midrule
1 & Compute & Node & Peak FLOPS ceiling & $T = \text{OPs} / (\text{Peak} \times \eta)$ \\
2 & Memory & Node & HBM capacity \& bandwidth & $T = |W| / \text{BW}$ \\
3 & Software & Node & Peak vs achieved FLOPS & $\eta_{\text{MFU}}$ \\
4 & Serving & Node & Prefill vs decode regimes & TTFT, ITL \\
5 & Batching & Node & KV cache fragmentation & PagedAttention \\
6 & Streaming & Node & Injection BW (wafer-scale) & $T = |W| / \text{BW}_{\text{inj}}$ \\
7 & Tail Latency & Node & P99 grows near saturation & Erlang-C \\
\midrule
8 & Ingestion & Data & Storage I/O supply rate & $\rho = \text{BW}_d / \text{BW}_s$ \\
9 & Transform & Data & CPU preprocessing rate & $T = BS / C$ \\
10 & Locality & Data & Bisection bandwidth limit & $\text{BW}_{\text{eff}}$ \\
\midrule
11 & Complexity & Algorithm & Chinchilla scaling laws & $C = 6PD$ \\
12 & Reasoning & Algorithm & Inference-time compute & $T = K \times T_{\text{step}}$ \\
13 & Fidelity & Algorithm & Compression--accuracy tradeoff & $r = 32/b$ \\
\midrule
14 & Communication & Fleet & AllReduce synchronization & Ring AllReduce \\
15 & Fragility & Fleet & Component failures at scale & $\text{MTBF}/N$ \\
16 & Multi-tenant & Fleet & Queueing delays in shared clusters & $\rho / [2\mu(1{-}\rho)]$ \\
\midrule
17 & Capital & Ops & Total cost of ownership & TCO = CapEx + OpEx \\
18 & Sustainability & Ops & Carbon \& water footprint & $\text{CO}_2 = E \times \text{PUE} \times \text{CI}$ \\
19 & Checkpoint & Ops & I/O burst penalty on MFU & $T_w / T_i$ \\
20 & Safety & Ops & Privacy/fairness overhead & DP-SGD $\sigma \propto 1/\varepsilon$ \\
\midrule
21 & Sensitivity & Analysis & Binding constraint ID & $\partial T / \partial x_i$ \\
22 & Synthesis & Analysis & Inverse Roofline (HW from SLA) & $\text{BW} = |W| / T$ \\
\bottomrule
\end{tabular}
\end{frame}
% --- Slide R.8: Hardware Zoo ---
\begin{frame}{The Hardware Zoo: Every Platform in mlsysim}
\note{[2 min] IMPORTANT FOR ISCA. Linger on this slide. Point out that this
is NOT an NVIDIA tool. Every vendor represented. Custom silicon can be
added with a 10-line YAML spec.
% -- FLEX: [CORE]
}
\centering
\scriptsize
\setlength{\tabcolsep}{3pt}
\begin{tabular}{@{}llll@{}}
\toprule
\textbf{Vendor} & \textbf{Cloud / Training} & \textbf{Edge / Inference} & \textbf{TinyML} \\
\midrule
\textbf{NVIDIA} & A100, H100, H200, B200 & Jetson Orin NX, Orin Nano & --- \\
\textbf{AMD} & MI250X, MI300X & --- & --- \\
\textbf{Intel} & Gaudi\,2, Gaudi\,3 & --- & --- \\
\textbf{Google} & TPU v4, TPU v5e & Coral Edge TPU & --- \\
\textbf{AWS} & Trainium2 & Inferentia2 & --- \\
\textbf{Cerebras} & CS-2 (wafer-scale) & --- & --- \\
\textbf{MCU} & --- & --- & ESP32-S3, nRF52840 \\
\textbf{Custom} & \multicolumn{3}{l}{Any accelerator via \texttt{mlsysim.Hardware.from\_spec(\{...\})}} \\
\bottomrule
\end{tabular}
\vspace{0.5em}
\small
\textbf{6 vendors, 20+ platforms, TinyML to Frontier.}\\[0.2em]
\alert{mlsysim is a multi-vendor analytical engine, not an NVIDIA wrapper.}
\end{frame}
% --- Slide R.9: Try It Now ---
\begin{frame}[fragile]{Try It Now}
\begin{center}
\Large\bfseries
One install. No GPU. Sub-second answers.
\end{center}
\vspace{1em}
\begin{lstlisting}[language=bash, backgroundcolor=\color{crimson!6}]
pip install mlsysim
\end{lstlisting}
\vspace{0.5em}
\begin{lstlisting}
import mlsysim
# Profile Llama-3 8B on H100 in one call
profile = mlsysim.Engine.solve(
    mlsysim.Models.Language.Llama3_8B,
    mlsysim.Hardware.H100,
    batch_size=1, seq_len=2048
)
print(profile.summary())
# => bottleneck, latency, throughput, MFU, feasibility
\end{lstlisting}
\vspace{0.5em}
\centering
\small
\begin{tabular}{@{}ll@{}}
GitHub: & \texttt{github.com/harvard-edge/mlsysim} \\
Docs: & \texttt{mlsysim.readthedocs.io} \\
Paper: & \texttt{arxiv.org/abs/XXXX.XXXXX} \\
\end{tabular}
\end{frame}