% cs249r_book/mlsysim/tutorial/slides/related_work.tex

% =============================================================================
% MLSys-im ISCA 2026 Tutorial — Related Work & Positioning (9 slides)
% =============================================================================
% For ISCA distribution: self-contained Metropolis theme.
% For internal use: swap to beamerthememlsys.sty
% =============================================================================
% This file is designed to be \input{related_work} into tutorial_part1.tex
% or tutorial_part2.tex. It contains no preamble or document environment.
% =============================================================================

% =====================================================================
% RELATED WORK & POSITIONING
% =====================================================================
\section{Related Work}

% --- Slide R.1: The Landscape ---
% Comparison table of existing ML performance tools against mlsysim,
% followed by two key observations.
%
% The table has six columns (Tool, Type, Speed, Scope, HW Coverage, Access),
% so the column spec declares exactly six `l` columns.
%
% \rowcolor and the colour `crimson` are not defined in this file; they are
% assumed to come from the including deck's preamble (this file has none).
%
% NOTE(review): this slide advertises "5 vendors" and names NVIDIA, AMD,
% Intel, Google and Cerebras, while the Hardware Zoo slide's table also has an
% AWS row. Confirm the intended vendor count.
\begin{frame}{The Landscape: Existing ML Performance Tools}
\centering
\scriptsize
\begin{tabular}{@{}llllll@{}}
\toprule
\textbf{Tool} & \textbf{Type} & \textbf{Speed} & \textbf{Scope} & \textbf{HW Coverage} & \textbf{Access} \\
\midrule
ASTRA-sim        & Discrete-event   & Hours      & Network              & NVIDIA          & Open \\
Calculon         & Analytical       & Seconds    & Training             & NVIDIA          & Open \\
DeepSpeed Profiler & Runtime        & Real-time  & Single-node          & NVIDIA          & Requires GPU \\
LLMPerf / vLLM   & Empirical        & Minutes    & Serving              & NVIDIA, AMD     & Requires GPU \\
Roofline Toolkit  & Empirical        & Minutes    & Compute/memory       & Intel, NVIDIA   & Requires HW \\
Internal (Google, Meta) & Analytical & Seconds    & Full-stack           & Custom          & Proprietary \\
% Highlighted row: mlsysim itself.
\rowcolor{crimson!12}
\textbf{mlsysim} & \textbf{Analytical} & \textbf{Sub-second} & \textbf{22 walls} & \textbf{5 vendors} & \textbf{Open, no GPU} \\
\bottomrule
\end{tabular}

\vspace{0.8em}
\small
\textbf{Key observations:} (1) Open tools cover 3--4 constraints each.
(2) Most are NVIDIA-only. mlsysim covers \textbf{NVIDIA, AMD, Intel, Google, Cerebras,}
and custom silicon---full stack from compute to carbon.
\end{frame}

% --- Slide R.2: What Makes mlsysim Different? ---
% Left column: three differentiators (breadth, unit safety, traceability).
% Right column: a summary block and the install one-liner.
%
% NOTE(review): this slide cites \texttt{Pipeline.solve()}, whereas the
% "Try It Now" slide calls \texttt{mlsysim.Engine.solve(...)}. Confirm which
% entry point is current.
\begin{frame}{What Makes mlsysim Different?}
\begin{columns}[T]
\column{0.55\textwidth}
\begin{enumerate}
  % Each item is a bold heading followed by a \small description.
  % The description sits in its own group so that \small does not stay active
  % for the following items' headings; all three headings now share one size.
  % The \par inside the group ends the paragraph while \small is still in
  % effect, which keeps the tighter line spacing for the description.
  \item \textbf{Breadth: 22 walls in one pipeline}\\
    {\small Others cover 3--4 constraints.\\
    mlsysim composes all 22 into a single\\
    \texttt{Pipeline.solve()} call.\par}

  \vspace{0.6em}
  \item \textbf{Unit safety: Pint dimensional analysis}\\
    {\small Every quantity carries its unit.\\
    \texttt{GB + TFLOPS} $\Rightarrow$ \textcolor{crimson}{DimensionalityError}.\\
    Catches errors spreadsheets never will.\par}

  \vspace{0.6em}
  \item \textbf{Traceability: Every constant is cited}\\
    {\small \texttt{TraceableConstant} records source,\\
    date, and DOI for every hardware spec.\\
    Audit any number back to its origin.\par}
\end{enumerate}

\column{0.42\textwidth}
\centering
\begin{block}{The Trifecta}
\small
% Two-column summary: feature vs. what other tools offer.
\begin{tabular}{@{}lc@{}}
Feature & Others \\
\midrule
22 walls     & 3--4 \\
Unit safety  & None \\
Traceability & Rare \\
\end{tabular}
\end{block}

\vspace{1em}
\begin{exampleblock}{One-Liner}
\footnotesize
\texttt{pip install mlsysim}\\[2pt]
No GPU. No cluster.\\
Just Python 3.10+.
\end{exampleblock}
\end{columns}
\end{frame}

% --- Slide R.3: The Fidelity-Speed Spectrum ---
% Qualitative 2-D map of tools: x axis = speed, y axis = fidelity.
% All coordinates are hand-placed. The colours midgray, computeblue, darktext
% and crimson are not defined in this file and are assumed to come from the
% including deck.
\begin{frame}{The Fidelity--Speed Spectrum}
\centering
\begin{tikzpicture}[
  every node/.style={font=\small},
  arrow/.style={-{Stealth[length=3mm]}, thick, color=midgray},
  tool/.style={draw, rounded corners=3pt, fill=computeblue, text=darktext,
               minimum height=0.7cm, minimum width=2.2cm, font=\footnotesize\bfseries},
  highlight/.style={tool, fill=crimson!15, draw=crimson, thick},
  speedlabel/.style={font=\scriptsize, color=midgray},
]
  % Axes: horizontal = speed, vertical = fidelity.
  \draw[arrow] (0,0) -- (12.5,0) node[right] {\textbf{Speed}};
  \draw[arrow] (0,0) -- (0,4.5)  node[above] {\textbf{Fidelity}};

  % Time-scale labels under the speed axis.
  \foreach \xpos/\ticktext in {1.5/Hours, 5.0/Minutes, 8.5/Seconds, 11.5/Instant}
    \node[speedlabel] at (\xpos,-0.4) {\ticktext};

  % Plain tool boxes. They are drawn before the mlsysim box on purpose:
  % the Calculon and mlsysim boxes touch at a corner, and mlsysim must stay
  % on top there.
  \foreach \toolid/\tx/\ty/\toollabel in {
      cycle/1.5/3.8/Cycle-accurate,
      astra/3.2/3.0/ASTRA-sim,
      llmperf/4.5/2.2/LLMPerf,
      calculon/7.0/2.5/Calculon,
      napkin/11.5/0.8/Napkin math}
    \node[tool] (\toolid) at (\tx,\ty) {\toollabel};

  % mlsysim (highlighted) and its caption.
  \node[highlight] (mlsysim) at (9.0, 2.0) {mlsysim};
  \node[font=\scriptsize, color=crimson, align=center] at (9.0, 1.0)
    {22 walls, sub-second\\composable pipeline};

  % Dashed "sweet spot" region with its label above.
  \draw[dashed, crimson, thick, rounded corners=6pt]
    (7.5, 0.5) rectangle (10.5, 2.8);
  \node[font=\scriptsize\itshape, color=crimson] at (9.0, 3.1)
    {``Fast analytical'' sweet spot};
\end{tikzpicture}

\vspace{0.3em}
\small\itshape
mlsysim trades cycle-accuracy for speed and breadth.\\
The right tool depends on the question you are asking.
\end{frame}

% --- Slide R.4: What mlsysim Does NOT Do ---
% Scope slide. Left column: what the tool does not replace, plus the stated
% accuracy band. Right column: what the tool is for, closed by the Box quote.
\begin{frame}{What mlsysim Does NOT Do}
\begin{columns}[T]

% ---- Left: non-goals and accuracy ----
\column{0.52\textwidth}
\textcolor{crimson}{\textbf{Not a replacement for:}}
\begin{itemize}
  \item \textbf{Benchmarking} --- real measurements on real hardware
  \item \textbf{Cycle-accurate simulation} --- microarchitectural detail
  \item \textbf{Production deployment} --- no orchestration, no serving
  \item \textbf{Framework profiling} --- no kernel-level tracing
\end{itemize}

\vspace{0.5em}
\textbf{Prediction accuracy:}\\
Within \alert{2--5\(\times\)} of measured performance.\\
Useful for \emph{ranking} and \emph{bounding},\\
not for exact latency guarantees.

% ---- Right: intended use ----
\column{0.45\textwidth}
\textcolor{datastroke}{\textbf{What it IS:}}
\begin{itemize}
  \item A \textbf{reasoning framework}
  \item Answers ``which constraint binds?''
  \item Answers ``is this hardware sufficient?''
  \item Answers ``what should I try first?''
  \item Runs in \(<\)1\,ms per evaluation
  \item No GPU required
\end{itemize}

\vspace{0.5em}
\begin{center}
\small\itshape
``All models are wrong,\\
but some are useful.''\\
--- George Box
\end{center}

\end{columns}
\end{frame}

% --- Slide R.5: How to Think About Accuracy ---
% Draws the analogy between CPI in computer architecture and eta (MFU) in ML
% systems: two text blocks, then a three-row side-by-side table.
\begin{frame}{How to Think About Accuracy: The CPI Analogy}

% Block 1: the computer-architecture precedent.
\begin{block}{Patterson's Insight (Computer Architecture)}
\small
Hennessy \& Patterson did not predict CPI from first principles.\\
They \textbf{measured} CPI empirically,
then used it to \textbf{reason} about architectural tradeoffs.
The value was in the \textbf{framework}, not the exact number.
\end{block}

\vspace{0.5em}
% Block 2: the same stance applied to ML systems.
\begin{block}{Our Approach (ML Systems)}
\small
We do not predict \(\eta\) (efficiency) from first principles.\\
We \textbf{measure} \(\eta\) empirically (MFU from published papers),
then use it to \textbf{reason} about system tradeoffs across 22 walls.
\end{block}

\vspace{0.8em}
\centering
% Row-by-row correspondence between the two fields.
\begin{tabular}{@{}lll@{}}
\toprule
                   & \textbf{Computer Architecture} & \textbf{ML Systems}       \\
\midrule
Empirical constant & CPI                            & \(\eta\) (MFU)            \\
Framework          & Iron Law of CPU Perf           & Iron Law of ML Systems    \\
Value              & Reason about ISA tradeoffs     & Reason about system walls \\
\bottomrule
\end{tabular}
\end{frame}

% --- Slide R.6: The Iron Law --- Our Contribution ---
% Shows the Iron Law and assigns walls to its terms in two side-by-side
% tables (15 walls); the remaining 7 are listed in the footer line.
%
% NOTE(review): the formula contains \eta_{\text{scaling}}, but neither table
% has a row for it. Confirm whether that is intended given the frame title.
% NOTE(review): wall 9 is "Transformation" here and "Transform" in the
% 22-walls table. Confirm the canonical name.
\begin{frame}{The Iron Law: Every Wall Maps to a Term}
\begin{center}
\large
\[
T = \frac{\text{FLOPs}}
         {N \;\times\; \text{Peak} \;\times\; \text{MFU} \;\times\; \eta_{\text{scaling}} \;\times\; \text{Goodput}}
\]
\end{center}

\vspace{0.3em}
\small
\begin{columns}[T]

% ---- Left table: numerator and parallelism ----
\column{0.48\textwidth}
\begin{tabular}{@{}ll@{}}
\toprule
\textbf{Term} & \textbf{Walls} \\
\midrule
FLOPs (numerator) & 11 Complexity \\
& 12 Reasoning \\
& 13 Fidelity \\
\midrule
\(N\) (parallelism) & 14 Communication \\
& 15 Fragility \\
& 16 Multi-tenant \\
\bottomrule
\end{tabular}

% ---- Right table: hardware peak, efficiency, goodput ----
\column{0.48\textwidth}
\begin{tabular}{@{}ll@{}}
\toprule
\textbf{Term} & \textbf{Walls} \\
\midrule
Peak (hardware) & 1 Compute \\
& 2 Memory \\
& 6 Streaming \\
\midrule
MFU (efficiency) & 3 Software \\
& 4 Serving \\
& 5 Batching \\
\midrule
Goodput & 8 Ingestion \\
& 9 Transformation \\
& 19 Checkpoint \\
\bottomrule
\end{tabular}

\end{columns}

\vspace{0.5em}
\centering
\footnotesize
% Walls not assigned to a single term above.
Remaining walls: 7 Tail Latency, 10 Locality, 17 Capital, 18 Sustainability, 20 Safety, 21 Sensitivity, 22 Synthesis\\
These are \textbf{cross-cutting constraints} and \textbf{meta-analysis tools} that modulate the Iron Law terms.
\end{frame}

% --- Slide R.7: 22 Walls at a Glance ---
% One-row-per-wall reference table. Rows are grouped by the Domain column
% (Node, Data, Algorithm, Fleet, Ops, Analysis) and separated by \midrule.
% \tabcolsep is reduced to 3pt for this table; the setting is local to the frame.
\begin{frame}{22 Walls at a Glance}
\centering
\scriptsize
\setlength{\tabcolsep}{3pt}
\begin{tabular}{@{}rllll@{}}
\toprule
\textbf{\#} & \textbf{Wall} & \textbf{Domain} & \textbf{Constraint} & \textbf{Key Equation} \\
\midrule
% ---- Node ----
1 & Compute & Node & Peak FLOPS ceiling & \(T = \text{OPs} / (\text{Peak} \times \eta)\) \\
2 & Memory & Node & HBM capacity \& bandwidth & \(T = |W| / \text{BW}\) \\
3 & Software & Node & Peak vs achieved FLOPS & \(\eta_{\text{MFU}}\) \\
4 & Serving & Node & Prefill vs decode regimes & TTFT, ITL \\
5 & Batching & Node & KV cache fragmentation & PagedAttention \\
6 & Streaming & Node & Injection BW (wafer-scale) & \(T = |W| / \text{BW}_{\text{inj}}\) \\
7 & Tail Latency & Node & P99 grows near saturation & Erlang-C \\
\midrule
% ---- Data ----
8 & Ingestion & Data & Storage I/O supply rate & \(\rho = \text{BW}_d / \text{BW}_s\) \\
9 & Transform & Data & CPU preprocessing rate & \(T = BS / C\) \\
10 & Locality & Data & Bisection bandwidth limit & \(\text{BW}_{\text{eff}}\) \\
\midrule
% ---- Algorithm ----
11 & Complexity & Algorithm & Chinchilla scaling laws & \(C = 6PD\) \\
12 & Reasoning & Algorithm & Inference-time compute & \(T = K \times T_{\text{step}}\) \\
13 & Fidelity & Algorithm & Compression--accuracy tradeoff & \(r = 32/b\) \\
\midrule
% ---- Fleet ----
14 & Communication & Fleet & AllReduce synchronization & Ring AllReduce \\
15 & Fragility & Fleet & Component failures at scale & \(\text{MTBF}/N\) \\
16 & Multi-tenant & Fleet & Queueing delays in shared clusters & \(\rho / [2\mu(1{-}\rho)]\) \\
\midrule
% ---- Ops ----
17 & Capital & Ops & Total cost of ownership & TCO = CapEx + OpEx \\
18 & Sustainability & Ops & Carbon \& water footprint & \(\text{CO}_2 = E \times \text{PUE} \times \text{CI}\) \\
19 & Checkpoint & Ops & I/O burst penalty on MFU & \(T_w / T_i\) \\
20 & Safety & Ops & Privacy/fairness overhead & DP-SGD \(\sigma \propto 1/\varepsilon\) \\
\midrule
% ---- Analysis ----
21 & Sensitivity & Analysis & Binding constraint ID & \(\partial T / \partial x_i\) \\
22 & Synthesis & Analysis & Inverse Roofline (HW from SLA) & \(\text{BW} = |W| / T\) \\
\bottomrule
\end{tabular}
\end{frame}

% --- Slide R.8: Hardware Zoo ---
% Vendor-by-deployment-tier table of supported platforms, plus a summary line.
% The first column is set bold through the column spec (>{\bfseries}l), which
% needs the array package; presumably it is already loaded, since \rowcolor is
% used elsewhere in this file and colortbl pulls it in. Verify in the deck.
%
% NOTE(review): the summary line says "5 vendors, 20+ platforms", but the
% table has six named vendor rows (NVIDIA, AMD, Intel, Google, AWS, Cerebras)
% plus MCU and Custom, and lists 18 platforms. Confirm the intended figures.
\begin{frame}{The Hardware Zoo: Every Platform in mlsysim}
\note{[2 min] IMPORTANT FOR ISCA. Linger on this slide. Point out that this
is NOT an NVIDIA tool. Every vendor represented. Custom silicon can be
added with a 10-line YAML spec.
% -- FLEX: [CORE]
}

\centering
\scriptsize
\setlength{\tabcolsep}{3pt}
\begin{tabular}{@{}>{\bfseries}llll@{}}
\toprule
Vendor & \textbf{Cloud / Training} & \textbf{Edge / Inference} & \textbf{TinyML} \\
\midrule
NVIDIA & A100, H100, H200, B200 & Jetson Orin NX, Orin Nano & --- \\
AMD & MI250X, MI300X & --- & --- \\
Intel & Gaudi\,2, Gaudi\,3 & --- & --- \\
Google & TPU v4, TPU v5e & Coral Edge TPU & --- \\
AWS & Trainium2 & Inferentia2 & --- \\
Cerebras & CS-2 (wafer-scale) & --- & --- \\
MCU & --- & --- & ESP32-S3, nRF52840 \\
% Catch-all row: user-defined accelerators.
Custom & \multicolumn{3}{l}{Any accelerator via \texttt{mlsysim.Hardware.from\_spec(\{...\})}} \\
\bottomrule
\end{tabular}

\vspace{0.5em}
\small
\textbf{5 vendors, 20+ platforms, TinyML to Frontier.}\\[0.2em]
\alert{mlsysim is a multi-vendor analytical engine, not an NVIDIA wrapper.}
\end{frame}

% --- Slide R.9: Try It Now ---
% Closing call-to-action: tagline, install command, a minimal Python example,
% and project links.
% The [fragile] option is needed because the frame contains lstlisting
% (verbatim) environments; \end{frame} must stay alone on its own line.
% Leave the listing bodies untouched: they are typeset verbatim.
\begin{frame}[fragile]{Try It Now}
\begin{center}
\Large\bfseries
One install. No GPU. Sub-second answers.
\end{center}

\vspace{1em}
% Install command, on a tinted background (`crimson` is not defined in this
% file; assumed to come from the including deck).
\begin{lstlisting}[language=bash, backgroundcolor=\color{crimson!6}]
pip install mlsysim
\end{lstlisting}

\vspace{0.5em}
% Minimal usage example.
% NOTE(review): this listing calls mlsysim.Engine.solve(...), whereas the
% "What Makes mlsysim Different?" slide cites Pipeline.solve(). Confirm which
% entry point is current.
\begin{lstlisting}
import mlsysim

# Profile Llama-3 8B on H100 in one line
profile = mlsysim.Engine.solve(
    mlsysim.Models.Language.Llama3_8B,
    mlsysim.Hardware.H100,
    batch_size=1, seq_len=2048
)
print(profile.summary())
# => bottleneck, latency, throughput, MFU, feasibility
\end{lstlisting}

\vspace{0.5em}
\centering
\small
% Project links.
% TODO: replace the placeholder arXiv identifier (XXXX.XXXXX) before distribution.
\begin{tabular}{@{}ll@{}}
GitHub:  & \texttt{github.com/harvard-edge/mlsysim} \\
Docs:    & \texttt{mlsysim.readthedocs.io} \\
Paper:   & \texttt{arxiv.org/abs/XXXX.XXXXX} \\
\end{tabular}
\end{frame}