Files
cs249r_book/slides/mlsysim_tutorial.tex
Vijay Janapa Reddi 1eb30f5f86 fix(mlsysim): harden release QA and paper artifacts
Align the MLSys·im code, docs, paper, website, workflows, and lab wheel for the 0.1.1 release. This also fixes runtime/API issues found during release review and prepares the paper PDF plus archive package.
2026-04-25 10:06:01 -04:00

119 lines
4.7 KiB
TeX

% Beamer deck: 16:9 aspect ratio, 10pt base font size.
\documentclass[10pt,aspectratio=169]{beamer}

% --- Theme and palette (styled to match the MLSysBook aesthetic) ---
\usetheme{metropolis}
\definecolor{HarvardCrimson}{RGB}{165,28,48}
\definecolor{DarkGray}{RGB}{51,51,51}
% Frame titles: white text on the dark-gray bar.
\setbeamercolor{frametitle}{bg=DarkGray,fg=white}
% Primary palette: white text on crimson.
\setbeamercolor{palette primary}{bg=HarvardCrimson,fg=white}

% --- Title-page metadata (consumed by \maketitle) ---
\title{Quantitative ML Systems}
\subtitle{From the Iron Law of Performance to Agentic Infrastructure Design}
\author{Machine Learning Systems Textbook Team}
\date{Conference Tutorial}
\begin{document}
\maketitle
% --- Section 1: The Problem ---
\section{The Reasoning Gap}
% Frame: states the problem and lists the two existing tool families
% (cycle-accurate simulators, spreadsheets) plus the resulting gap.
\begin{frame}{The Crisis in Systems Research}
  \textbf{The Problem:} Systems are getting more complex, but the tools to think about them have not kept pace.
  \vspace{0.5cm}
  \begin{itemize}
    \item \textbf{Cycle-Accurate Simulators:} Require hours to compile, weeks to run a single LLaMA-70B epoch.
    \item \textbf{Spreadsheet Math:} Error-prone, silent unit conversions (GB vs GiB, MACs vs FLOPs), impossible to share or version-control.
    % TeX quote ligatures (``...'') are used here: a straight ASCII "
    % typesets as a closing quote on both sides.
    \item \textbf{The Result:} The ``Reasoning Gap''. A student cannot requisition a 10,000-GPU cluster to test how a Ring-AllReduce topology affects latency.
  \end{itemize}
\end{frame}
% Frame: positions MLSys·im as an analytical framework that trades
% cycle accuracy for completeness and speed.
\begin{frame}{The Solution: MLSys$\cdot$im}
  Taking inspiration from Hennessy \& Patterson's MIPS simulator:
  we sacrifice cycle accuracy for \textbf{taxonomic completeness}
  and \textbf{execution speed}.
  \vspace{0.5cm}
  % Definition box for the tool.
  \begin{block}{What is MLSys$\cdot$im?}
    A pure-Python, dimensionally-strict analytical framework that
    evaluates the physics of ML workloads from single-node SRAM to
    100,000-node datacenters in \emph{milliseconds}.
  \end{block}
\end{frame}
% --- Section 2: The Core Physics ---
\section{The Iron Law \& The 22 Walls}
% Frame: the roofline-style "Iron Law" equation followed by two takeaways.
\begin{frame}{The Iron Law of ML Performance}
  % The size change is confined to a group so it only affects the equation.
  % Display math is centred by itself, so no \centering is needed; an
  % unscoped \centering would also centre the text of the bullets below.
  % \[ ... \] replaces the plain-TeX $$ ... $$ form.
  {\Large
  \[
    T = \max\left(
          \frac{\text{OPs}}{\text{Peak}_{\text{FLOPS}} \times \eta},
          \frac{\text{Bytes}}{\text{BW}_{\text{HBM}}}
        \right) + \text{Overhead}
  \]
  }%
  \vspace{0.5cm}
  \begin{itemize}
    % $\times$ is the multiplication sign; a bare "x" is just the letter.
    \item \textbf{The Memory Wall:} Why a 3.2$\times$ FLOPS upgrade (A100 $\rightarrow$ H100) often only yields a 1.7$\times$ speedup for LLMs.
    \item \textbf{The Efficiency ($\eta$):} Absorbs micro-architectural chaos into a single verifiable parameter.
  \end{itemize}
\end{frame}
% Frame: enumerates the five layers of the modelling stack, each tagged
% with the role it plays (demand, supply, environment, topology, analysis).
\begin{frame}{The 5-Layer Stack}
  We strictly decouple \emph{Demand} from \emph{Supply}.
  \vspace{0.3cm}
  \begin{enumerate}
    \item \textbf{Workloads (Demand):}
          FLOPs, parameters, KV-cache needs.
    \item \textbf{Hardware (Supply):}
          Silicon specs (Peak FLOPS, Bandwidth).
    \item \textbf{Infrastructure (Environment):}
          Grid Carbon, PUE, WUE.
    \item \textbf{Systems (Topology):}
          Fleet composition, Network fabrics.
    \item \textbf{Solvers (Analysis):}
          The engines that evaluate the 22 physical walls.
  \end{enumerate}
\end{frame}
% --- Section 3: Live Examples ---
\section{Live Physics: From Node to Fleet}
% Frame: worked example on the cost of adding Chain-of-Thought decoding.
% Currency is written in text mode (\$9) rather than math mode, so the
% digits use the text font.
\begin{frame}{Example 1: The \$9 Million Question}
  \textbf{Scenario:} Adding Chain-of-Thought ($K=8$) reasoning to production.

  % The blank lines around \vspace end the paragraph first. Inside a
  % paragraph, \vspace only adds space after the current line, so the two
  % sentences would otherwise be set as one run-on paragraph.
  \vspace{0.3cm}

  \emph{We use MLSys$\cdot$im to prove:}
  \begin{itemize}
    \item CoT scales at the memory-bound \emph{decode} rate (ITL).
    \item A 7$\times$ latency increase requires 7$\times$ more GPUs to maintain QPS.
    \item An algorithmic tweak turns a \$1.2M server bill into a \$9.1M capital expenditure.
  \end{itemize}
\end{frame}
% --- Section 4: The Agentic Future ---
\section{The Climax: Agentic Infrastructure Design}
% Frame: describes the tool as something an LLM agent can call (YAML in,
% bottleneck report out).
\begin{frame}{MLSys$\cdot$im as an Agent Protocol (MCP)}
  \textbf{Vision:} AI designing AI infrastructure.

  % The blank lines around \vspace end the paragraph first. Inside a
  % paragraph, \vspace only adds space after the current line, so the
  % "Vision" line and the next sentence would otherwise be set as one
  % run-on paragraph.
  \vspace{0.3cm}

  LLMs are terrible at math, but excellent at calling strictly-typed tools.
  \begin{itemize}
    \item \textbf{Bring Your Own YAML (BYOY):} Define chips declaratively.
    \item \textbf{Model Context Protocol (MCP):} MLSys$\cdot$im provides a native MCP server exposing its JSON schema.
    \item \textbf{The Agentic Loop:} Claude generates a cluster YAML $\rightarrow$ MLSys$\cdot$im evaluates it $\rightarrow$ returns physical bottleneck (e.g., OOM at batch 256) $\rightarrow$ Claude autonomously adjusts node count.
  \end{itemize}
\end{frame}
% Frame: closing slide with the tagline, install command, and repository link.
\begin{frame}{Summary \& Call to Action}
  \begin{center}
    \Large \textbf{Be the Standard, Not the Bottleneck.}
  \end{center}
  \vspace{0.5cm}
  \begin{itemize}
    \item \texttt{pip install mlsysim}
    \item Don't write a custom simulator for your next ISCA paper. Write an \texttt{mlsysim} plugin!
    \item Connect it to your AI agents today.
  \end{itemize}
  \vspace{0.5cm}
  \centering
  % \href makes the repository address a clickable link in the PDF while
  % keeping the same bold visible text (beamer already loads hyperref).
  % The underscore is unescaped in the URL argument and escaped in the
  % visible text.
  \href{https://github.com/harvard-edge/cs249r_book}{\textbf{github.com/harvard-edge/cs249r\_book}}
\end{frame}
\end{document}