% Tutorial slides for the MLSys$\cdot$im 0.1.1 release.
% Mirror of https://github.com/harvard-edge/cs249r_book.git
\documentclass[aspectratio=169, 10pt]{beamer}

% Theme and styling to match MLSysBook aesthetic
\usetheme{metropolis}
\definecolor{HarvardCrimson}{RGB}{165, 28, 48}
\definecolor{DarkGray}{RGB}{51, 51, 51}
\setbeamercolor{frametitle}{bg=DarkGray, fg=white}
\setbeamercolor{palette primary}{bg=HarvardCrimson, fg=white}

\title{Quantitative ML Systems}
\subtitle{From the Iron Law of Performance to Agentic Infrastructure Design}
\author{Machine Learning Systems Textbook Team}
\date{Conference Tutorial}

\begin{document}
\maketitle

% --- Section 1: The Problem ---
\section{The Reasoning Gap}

\begin{frame}{The Crisis in Systems Research}
\textbf{The Problem:} Systems are getting more complex, but the tools to think about them have not kept pace.

\vspace{0.5cm}

\begin{itemize}
  \item \textbf{Cycle-Accurate Simulators:} Require hours to compile, weeks to run a single LLaMA-70B epoch.
  \item \textbf{Spreadsheet Math:} Error-prone, silent unit conversions (GB vs.\ GiB, MACs vs.\ FLOPs), impossible to share or version-control.
  \item \textbf{The Result:} The ``Reasoning Gap''. A student cannot requisition a 10,000-GPU cluster to test how a Ring-AllReduce topology affects latency.
\end{itemize}
\end{frame}
\begin{frame}{The Solution: MLSys$\cdot$im}
Taking inspiration from Hennessy \& Patterson's MIPS simulator: we sacrifice cycle accuracy for \textbf{taxonomic completeness} and \textbf{execution speed}.

\vspace{0.5cm}

\begin{block}{What is MLSys$\cdot$im?}
  A pure-Python, dimensionally-strict analytical framework that evaluates the physics of ML workloads from single-node SRAM to 100,000-node datacenters in \emph{milliseconds}.
\end{block}
\end{frame}
% --- Section 2: The Core Physics ---
\section{The Iron Law \& The 22 Walls}

\begin{frame}{The Iron Law of ML Performance}
\centering
\Large
\[
  T = \max\left( \frac{\text{OPs}}{\text{Peak}_{\text{FLOPS}} \times \eta},
                 \frac{\text{Bytes}}{\mathit{BW}_{\text{HBM}}} \right) + \text{Overhead}
\]

\vspace{0.5cm}

\normalsize
\begin{itemize}
  \item \textbf{The Memory Wall:} Why a 3.2$\times$ FLOPS upgrade (A100 $\rightarrow$ H100) often only yields a 1.7$\times$ speedup for LLMs.
  \item \textbf{The Efficiency ($\eta$):} Absorbs micro-architectural chaos into a single verifiable parameter.
\end{itemize}
\end{frame}
\begin{frame}{The 5-Layer Stack}
We strictly decouple \emph{Demand} from \emph{Supply}.

\vspace{0.3cm}

\begin{enumerate}
  \item \textbf{Workloads (Demand):} FLOPs, parameters, KV-cache needs.
  \item \textbf{Hardware (Supply):} Silicon specs (Peak FLOPS, Bandwidth).
  \item \textbf{Infrastructure (Environment):} Grid Carbon, PUE, WUE.
  \item \textbf{Systems (Topology):} Fleet composition, Network fabrics.
  \item \textbf{Solvers (Analysis):} The engines that evaluate the 22 physical walls.
\end{enumerate}
\end{frame}
% --- Section 3: Live Examples ---
\section{Live Physics: From Node to Fleet}

\begin{frame}{Example 1: The \$9 Million Question}
\textbf{Scenario:} Adding Chain-of-Thought ($K=8$) reasoning to production.

\vspace{0.3cm}

\textit{We use MLSys$\cdot$im to prove:}
\begin{itemize}
  \item CoT scales at the memory-bound \emph{decode} rate (ITL).
  \item A 7$\times$ latency increase requires 7$\times$ more GPUs to maintain QPS.
  \item An algorithmic tweak turns a \$1.2M server bill into a \$9.1M capital expenditure.
\end{itemize}
\end{frame}
% --- Section 4: The Agentic Future ---
\section{The Climax: Agentic Infrastructure Design}

\begin{frame}{MLSys$\cdot$im as an Agent Protocol (MCP)}
\textbf{Vision:} AI designing AI infrastructure.

\vspace{0.3cm}

LLMs are terrible at math, but excellent at calling strictly-typed tools.
\begin{itemize}
  \item \textbf{Bring Your Own YAML (BYOY):} Define chips declaratively.
  \item \textbf{Model Context Protocol (MCP):} MLSys$\cdot$im provides a native MCP server exposing its JSON schema.
  \item \textbf{The Agentic Loop:} Claude generates a cluster YAML $\rightarrow$ MLSys$\cdot$im evaluates it $\rightarrow$ returns physical bottleneck (e.g., OOM at batch 256) $\rightarrow$ Claude autonomously adjusts node count.
\end{itemize}
\end{frame}
\begin{frame}{Summary \& Call to Action}
\begin{center}
  \Large \textbf{Be the Standard, Not the Bottleneck.}
\end{center}

\vspace{0.5cm}

\begin{itemize}
  \item \texttt{pip install mlsysim}
  \item Don't write a custom simulator for your next ISCA paper. Write an \texttt{mlsysim} plugin!
  \item Connect it to your AI agents today.
\end{itemize}

\vspace{0.5cm}

\centering
% hyperref is loaded automatically by beamer, so \url is available.
\textbf{\url{https://github.com/harvard-edge/cs249r_book}}
\end{frame}

\end{document}