% =============================================================================
% MLSys-im ISCA 2026 Tutorial — Related Work & Positioning (~9 slides)
% =============================================================================
% For ISCA distribution: self-contained Metropolis theme.
% For internal use: swap to beamerthememlsys.sty
% =============================================================================
% This file is designed to be \input{related_work} into tutorial_part1.tex
% or tutorial_part2.tex. It contains no preamble or document environment.
%
% Judging by the commands used below, the including preamble must provide:
%   - booktabs (\toprule/\midrule/\bottomrule) and colortbl (\rowcolor)
%   - tikz with the arrows.meta library (Stealth arrow tip)
%   - listings (lstlisting environment)
%   - the colours crimson, midgray, computeblue, darktext, datastroke
% NOTE(review): confirm this list against the actual tutorial preamble.
% =============================================================================

% =====================================================================
% RELATED WORK & POSITIONING
% =====================================================================
\section{Related Work}

% --- Slide R.1: The Landscape ---
% Comparison table of existing tools; the highlighted last row is mlsysim.
\begin{frame}{The Landscape: Existing ML Performance Tools}
  \centering
  \scriptsize
  % Six columns: the column spec must match the six header cells below.
  \begin{tabular}{@{}llllll@{}}
    \toprule
    \textbf{Tool} & \textbf{Type} & \textbf{Speed} & \textbf{Scope} & \textbf{HW Coverage} & \textbf{Access} \\
    \midrule
    ASTRA-sim & Discrete-event & Hours & Network & NVIDIA & Open \\
    Calculon & Analytical & Seconds & Training & NVIDIA & Open \\
    DeepSpeed Profiler & Runtime & Real-time & Single-node & NVIDIA & Requires GPU \\
    LLMPerf / vLLM & Empirical & Minutes & Serving & NVIDIA, AMD & Requires GPU \\
    Roofline Toolkit & Empirical & Minutes & Compute/memory & Intel, NVIDIA & Requires HW \\
    Internal (Google, Meta) & Analytical & Seconds & Full-stack & Custom & Proprietary \\
    % Vendor count kept in sync with the Hardware Zoo table (Slide R.8).
    \rowcolor{crimson!12}
    \textbf{mlsysim} & \textbf{Analytical} & \textbf{Sub-second} & \textbf{22 walls} & \textbf{6 vendors} & \textbf{Open, no GPU} \\
    \bottomrule
  \end{tabular}

  \vspace{0.8em}

  \small
  \textbf{Key observations:} (1) Open tools cover 3--4 constraints each.
  (2) Most are NVIDIA-only.
  mlsysim covers \textbf{NVIDIA, AMD, Intel, Google, AWS, Cerebras,} and custom
  silicon---full stack from compute to carbon.
\end{frame}

% --- Slide R.2: What Makes mlsysim Different? ---
% Left column: three differentiators. Right column: summary block + install hint.
% NOTE(review): this slide cites \texttt{Pipeline.solve()} while Slide R.9 calls
% mlsysim.Engine.solve(...); confirm which entry point the slides should show.
\begin{frame}{What Makes mlsysim Different?}
  \begin{columns}[T]
    \column{0.55\textwidth}
    \begin{enumerate}
      \item \textbf{Breadth: 22 walls in one pipeline}\\
        \small Others cover 3--4 constraints.\\
        mlsysim composes all 22 into a single\\
        \texttt{Pipeline.solve()} call.
        \vspace{0.6em}
      \item \textbf{Unit safety: Pint dimensional analysis}\\
        \small Every quantity carries its unit.\\
        \texttt{GB + TFLOPS} $\Rightarrow$ \textcolor{crimson}{DimensionalityError}.\\
        Catches errors spreadsheets never will.
        \vspace{0.6em}
      \item \textbf{Traceability: Every constant is cited}\\
        \small \texttt{TraceableConstant} records source,\\
        date, and DOI for every hardware spec.\\
        Audit any number back to its origin.
    \end{enumerate}

    \column{0.42\textwidth}
    \centering
    \begin{block}{The Trifecta}
      \small
      \begin{tabular}{@{}lc@{}}
        Feature & Others \\
        \midrule
        22 walls & 3--4 \\
        Unit safety & None \\
        Traceability & Rare \\
      \end{tabular}
    \end{block}

    \vspace{1em}

    \begin{exampleblock}{One-Liner}
      \footnotesize
      \texttt{pip install mlsysim}\\[2pt]
      No GPU. No cluster.\\
      Just Python 3.10+.
    \end{exampleblock}
  \end{columns}
\end{frame}

% --- Slide R.3: The Fidelity-Speed Spectrum ---
% Scatter-style diagram: x axis = speed (hours -> instant), y axis = fidelity.
\begin{frame}{The Fidelity--Speed Spectrum}
  \centering
  \begin{tikzpicture}[
      every node/.style={font=\small},
      arrow/.style={-{Stealth[length=3mm]}, thick, color=midgray},
      tool/.style={draw, rounded corners=3pt, fill=computeblue, text=darktext,
        minimum height=0.7cm, minimum width=2.2cm, font=\footnotesize\bfseries},
      % highlight inherits from tool and overrides fill/border for mlsysim.
      highlight/.style={tool, fill=crimson!15, draw=crimson, thick},
    ]
    % Axis
    \draw[arrow] (0,0) -- (12.5,0) node[right] {\textbf{Speed}};
    \draw[arrow] (0,0) -- (0,4.5) node[above] {\textbf{Fidelity}};

    % Labels on axis
    \node[font=\scriptsize, color=midgray] at (1.5,-0.4) {Hours};
    \node[font=\scriptsize, color=midgray] at (5.0,-0.4) {Minutes};
    \node[font=\scriptsize, color=midgray] at (8.5,-0.4) {Seconds};
    \node[font=\scriptsize, color=midgray] at (11.5,-0.4) {Instant};

    % Tools
    \node[tool] (cycle) at (1.5, 3.8) {Cycle-accurate};
    \node[tool] (astra) at (3.2, 3.0) {ASTRA-sim};
    \node[tool] (llmperf) at (4.5, 2.2) {LLMPerf};
    \node[tool] (calculon) at (7.0, 2.5) {Calculon};
    \node[highlight] (mlsysim) at (9.0, 2.0) {mlsysim};
    \node[tool] (napkin) at (11.5, 0.8) {Napkin math};

    % Annotation for mlsysim
    \node[font=\scriptsize, color=crimson, align=center] at (9.0, 1.0)
      {22 walls, sub-second\\composable pipeline};

    % Sweet-spot region
    \draw[dashed, crimson, thick, rounded corners=6pt] (7.5, 0.5) rectangle (10.5, 2.8);
    \node[font=\scriptsize\itshape, color=crimson] at (9.0, 3.1)
      {``Fast analytical'' sweet spot};
  \end{tikzpicture}

  \vspace{0.3em}

  \small\itshape
  mlsysim trades cycle-accuracy for speed and breadth.\\
  The right tool depends on the question you are asking.
\end{frame}

% --- Slide R.4: What mlsysim Does NOT Do ---
% Left column: explicit non-goals and accuracy bound. Right column: what it is.
\begin{frame}{What mlsysim Does NOT Do}
  \begin{columns}[T]
    \column{0.52\textwidth}
    \textbf{\textcolor{crimson}{Not a replacement for:}}
    \begin{itemize}
      \item \textbf{Benchmarking} --- real measurements on real hardware
      \item \textbf{Cycle-accurate simulation} --- microarchitectural detail
      \item \textbf{Production deployment} --- no orchestration, no serving
      \item \textbf{Framework profiling} --- no kernel-level tracing
    \end{itemize}

    \vspace{0.5em}

    \textbf{Prediction accuracy:}\\
    Within \alert{2--5$\times$} of measured performance.\\
    Useful for \emph{ranking} and \emph{bounding},\\
    not for exact latency guarantees.

    \column{0.45\textwidth}
    \textbf{\textcolor{datastroke}{What it IS:}}
    \begin{itemize}
      \item A \textbf{reasoning framework}
      \item Answers ``which constraint binds?''
      \item Answers ``is this hardware sufficient?''
      \item Answers ``what should I try first?''
      \item Runs in $<$1\,ms per evaluation
      \item No GPU required
    \end{itemize}

    \vspace{0.5em}

    \begin{center}
      \small\itshape
      ``All models are wrong,\\
      but some are useful.''\\
      --- George Box
    \end{center}
  \end{columns}
\end{frame}

% --- Slide R.5: How to Think About Accuracy ---
% Draws the parallel between CPI in computer architecture and eta (MFU) here.
\begin{frame}{How to Think About Accuracy: The CPI Analogy}
  \begin{block}{Patterson's Insight (Computer Architecture)}
    \small
    Hennessy \& Patterson did not predict CPI from first principles.\\
    They \textbf{measured} CPI empirically, then used it to \textbf{reason}
    about architectural tradeoffs. The value was in the \textbf{framework},
    not the exact number.
  \end{block}

  \vspace{0.5em}

  \begin{block}{Our Approach (ML Systems)}
    \small
    We do not predict $\eta$ (efficiency) from first principles.\\
    We \textbf{measure} $\eta$ empirically (MFU from published papers),
    then use it to \textbf{reason} about system tradeoffs across 22 walls.
  \end{block}

  \vspace{0.8em}

  \centering
  \begin{tabular}{@{}lll@{}}
    \toprule
    & \textbf{Computer Architecture} & \textbf{ML Systems} \\
    \midrule
    Empirical constant & CPI & $\eta$ (MFU) \\
    Framework & Iron Law of CPU Perf & Iron Law of ML Systems \\
    Value & Reason about ISA tradeoffs & Reason about system walls \\
    \bottomrule
  \end{tabular}
\end{frame}

% --- Slide R.6: The Iron Law --- Our Contribution ---
% Maps 15 of the 22 walls onto terms of the Iron Law; the remaining 7 are
% listed in the footer line. Wall numbers must agree with Slide R.7.
% NOTE(review): the formula contains an \eta_{scaling} term that has no row in
% the tables below; confirm whether a wall should be mapped to it.
\begin{frame}{The Iron Law: Every Wall Maps to a Term}
  \begin{center}
    \large
    \[
      T = \frac{\text{FLOPs}}{N \;\times\; \text{Peak} \;\times\; \text{MFU} \;\times\; \eta_{\text{scaling}} \;\times\; \text{Goodput}}
    \]
  \end{center}

  \vspace{0.3em}

  \small
  \begin{columns}[T]
    \column{0.48\textwidth}
    \begin{tabular}{@{}ll@{}}
      \toprule
      \textbf{Term} & \textbf{Walls} \\
      \midrule
      FLOPs (numerator) & 11 Complexity \\
      & 12 Reasoning \\
      & 13 Fidelity \\
      \midrule
      $N$ (parallelism) & 14 Communication \\
      & 15 Fragility \\
      & 16 Multi-tenant \\
      \bottomrule
    \end{tabular}

    \column{0.48\textwidth}
    \begin{tabular}{@{}ll@{}}
      \toprule
      \textbf{Term} & \textbf{Walls} \\
      \midrule
      Peak (hardware) & 1 Compute \\
      & 2 Memory \\
      & 6 Streaming \\
      \midrule
      MFU (efficiency) & 3 Software \\
      & 4 Serving \\
      & 5 Batching \\
      \midrule
      Goodput & 8 Ingestion \\
      & 9 Transformation \\
      & 19 Checkpoint \\
      \bottomrule
    \end{tabular}
  \end{columns}

  \vspace{0.5em}

  \centering
  \footnotesize
  Remaining walls: 7 Tail Latency, 10 Locality, 17 Capital, 18 Sustainability, 20 Safety, 21 Sensitivity, 22 Synthesis\\
  These are \textbf{cross-cutting constraints} and \textbf{meta-analysis tools} that modulate the Iron Law terms.
\end{frame}

% --- Slide R.7: 22 Walls at a Glance ---
% One row per wall, grouped by domain (Node, Data, Algorithm, Fleet, Ops,
% Analysis). Wall names are kept identical to those used on Slide R.6.
\begin{frame}{22 Walls at a Glance}
  \centering
  \scriptsize
  \setlength{\tabcolsep}{3pt}
  \begin{tabular}{@{}rllll@{}}
    \toprule
    \textbf{\#} & \textbf{Wall} & \textbf{Domain} & \textbf{Constraint} & \textbf{Key Equation} \\
    \midrule
    1 & Compute & Node & Peak FLOPS ceiling & $T = \text{OPs} / (\text{Peak} \times \eta)$ \\
    2 & Memory & Node & HBM capacity \& bandwidth & $T = |W| / \text{BW}$ \\
    3 & Software & Node & Peak vs achieved FLOPS & $\eta_{\text{MFU}}$ \\
    4 & Serving & Node & Prefill vs decode regimes & TTFT, ITL \\
    5 & Batching & Node & KV cache fragmentation & PagedAttention \\
    6 & Streaming & Node & Injection BW (wafer-scale) & $T = |W| / \text{BW}_{\text{inj}}$ \\
    7 & Tail Latency & Node & P99 grows near saturation & Erlang-C \\
    \midrule
    8 & Ingestion & Data & Storage I/O supply rate & $\rho = \text{BW}_d / \text{BW}_s$ \\
    9 & Transformation & Data & CPU preprocessing rate & $T = BS / C$ \\
    10 & Locality & Data & Bisection bandwidth limit & $\text{BW}_{\text{eff}}$ \\
    \midrule
    11 & Complexity & Algorithm & Chinchilla scaling laws & $C = 6PD$ \\
    12 & Reasoning & Algorithm & Inference-time compute & $T = K \times T_{\text{step}}$ \\
    13 & Fidelity & Algorithm & Compression--accuracy tradeoff & $r = 32/b$ \\
    \midrule
    14 & Communication & Fleet & AllReduce synchronization & Ring AllReduce \\
    15 & Fragility & Fleet & Component failures at scale & $\text{MTBF}/N$ \\
    16 & Multi-tenant & Fleet & Queueing delays in shared clusters & $\rho / [2\mu(1{-}\rho)]$ \\
    \midrule
    17 & Capital & Ops & Total cost of ownership & TCO = CapEx + OpEx \\
    18 & Sustainability & Ops & Carbon \& water footprint & $\text{CO}_2 = E \times \text{PUE} \times \text{CI}$ \\
    19 & Checkpoint & Ops & I/O burst penalty on MFU & $T_w / T_i$ \\
    20 & Safety & Ops & Privacy/fairness overhead & DP-SGD $\sigma \propto 1/\varepsilon$ \\
    \midrule
    21 & Sensitivity & Analysis & Binding constraint ID & $\partial T / \partial x_i$ \\
    22 & Synthesis & Analysis & Inverse Roofline (HW from SLA) & $\text{BW} = |W| / T$ \\
    \bottomrule
  \end{tabular}
\end{frame}

% --- Slide R.8: Hardware Zoo ---
% Platform matrix by vendor and deployment tier.
% The vendor count in the summary line counts the named silicon vendors in the
% table (NVIDIA, AMD, Intel, Google, AWS, Cerebras); keep it in sync with R.1.
% NOTE(review): the summary claims "20+ platforms" but the table lists 18;
% confirm the number or extend the table.
\begin{frame}{The Hardware Zoo: Every Platform in mlsysim}
  \note{[2 min] IMPORTANT FOR ISCA. Linger on this slide.
    Point out that this is NOT an NVIDIA tool. Every vendor represented.
    Custom silicon can be added with a 10-line YAML spec.
    % -- FLEX: [CORE]
  }
  \centering
  \scriptsize
  \setlength{\tabcolsep}{3pt}
  \begin{tabular}{@{}llll@{}}
    \toprule
    \textbf{Vendor} & \textbf{Cloud / Training} & \textbf{Edge / Inference} & \textbf{TinyML} \\
    \midrule
    \textbf{NVIDIA} & A100, H100, H200, B200 & Jetson Orin NX, Orin Nano & --- \\
    \textbf{AMD} & MI250X, MI300X & --- & --- \\
    \textbf{Intel} & Gaudi\,2, Gaudi\,3 & --- & --- \\
    \textbf{Google} & TPU v4, TPU v5e & Coral Edge TPU & --- \\
    \textbf{AWS} & Trainium2 & Inferentia2 & --- \\
    \textbf{Cerebras} & CS-2 (wafer-scale) & --- & --- \\
    \textbf{MCU} & --- & --- & ESP32-S3, nRF52840 \\
    \textbf{Custom} & \multicolumn{3}{l}{Any accelerator via \texttt{mlsysim.Hardware.from\_spec(\{...\})}} \\
    \bottomrule
  \end{tabular}

  \vspace{0.5em}

  \small
  \textbf{6 vendors, 20+ platforms, TinyML to Frontier.}\\[0.2em]
  \alert{mlsysim is a multi-vendor analytical engine, not an NVIDIA wrapper.}
\end{frame}

% --- Slide R.9: Try It Now ---
% Fragile frame (contains verbatim listings): \end{frame} must stay alone on
% its own line, and the listing bodies must keep their line breaks.
% NOTE(review): the arXiv identifier below is still a placeholder.
\begin{frame}[fragile]{Try It Now}
  \begin{center}
    \Large\bfseries One install. No GPU. Sub-second answers.
  \end{center}

  \vspace{1em}

\begin{lstlisting}[language=bash, backgroundcolor=\color{crimson!6}]
pip install mlsysim
\end{lstlisting}

  \vspace{0.5em}

\begin{lstlisting}
import mlsysim

# Profile Llama-3 8B on H100 in one line
profile = mlsysim.Engine.solve(
    mlsysim.Models.Language.Llama3_8B,
    mlsysim.Hardware.H100,
    batch_size=1, seq_len=2048
)
print(profile.summary())
# => bottleneck, latency, throughput, MFU, feasibility
\end{lstlisting}

  \vspace{0.5em}

  \centering
  \small
  % \href keeps the visible text unchanged while making the links clickable.
  \begin{tabular}{@{}ll@{}}
    GitHub: & \href{https://github.com/harvard-edge/mlsysim}{\texttt{github.com/harvard-edge/mlsysim}} \\
    Docs: & \href{https://mlsysim.readthedocs.io}{\texttt{mlsysim.readthedocs.io}} \\
    Paper: & \texttt{arxiv.org/abs/XXXX.XXXXX} \\
  \end{tabular}
\end{frame}