diff --git a/mlsysim/tutorial/slides/tutorial_part1.tex b/mlsysim/tutorial/slides/tutorial_part1.tex index 60bfda865..f3c9259b4 100644 --- a/mlsysim/tutorial/slides/tutorial_part1.tex +++ b/mlsysim/tutorial/slides/tutorial_part1.tex @@ -603,6 +603,53 @@ FlashAttention & 60--75\% \\ \alert{Improving MFU is often cheaper than buying more GPUs.} \end{frame} +% --- Slide 1.8b: What Is Eta? --- +\begin{frame}{What Is $\eta$? (The Efficiency Parameter)} +\note{[3 min] CRITICAL — every demo uses eta. Explain it ONCE here, then it's understood for the day. +ANALOGY: ``eta is to ML systems what CPI is to CPU design --- an empirical constant that bridges peak specs and reality.''} + +\small +\textbf{$\eta$ = Achieved FLOPS / Peak FLOPS} (Model FLOPs Utilization) + +\vspace{0.3cm} +The gap between what your hardware \emph{could} do and what it \emph{actually} does. + +\vspace{0.3cm} +\begin{columns}[T] +\begin{column}{0.48\textwidth} +\textbf{What reduces $\eta$:} +\begin{itemize}\setlength\itemsep{1pt} +\item Kernel launch overhead +\item SM occupancy limits +\item Memory coalescing misses +\item Framework overhead (Python GIL) +\item Communication stalls +\end{itemize} +\end{column} +\begin{column}{0.48\textwidth} +\textbf{Typical values:} + +\scriptsize +\begin{tabular}{@{}lr@{}} +\toprule +Scenario & $\eta$ \\ +\midrule +Training (Megatron-LM) & 0.40--0.55 \\ +Training (PyTorch eager) & 0.08--0.15 \\ +Inference decode, bs=1 & 0.01--0.05 \\ +Inference decode, bs=32+ & 0.15--0.35 \\ +Inference prefill & 0.30--0.50 \\ +TinyML (TFLite Micro) & 0.05--0.15 \\ +\bottomrule +\end{tabular} +\end{column} +\end{columns} + +\vfill +\centering +\small\textcolor{gray}{You do not predict $\eta$ --- you measure it once and use it for what-if analysis.} +\end{frame} + % --- Slide 1.9: The Iron Law (Full) --- \begin{frame}{The Iron Law of ML Systems} \note{[3 min] Walk through each denominator term. Point out that every @@ -1492,6 +1539,34 @@ print(f"Prune: {r2.inference_speedup:.1f}x / " \end{lstlisting} \end{frame} +% --- Slide 3.7b: Compression Changes Fleet Architecture --- +\begin{frame}{Compression Changes Fleet Architecture} +\note{[3 min] This is the ``aha'' that compression is architecture, not optimization. +The punchline: INT4 halves your GPU count AND your electricity bill.} + +\small +\textbf{Llama-3 70B Serving Fleet:} + +\vspace{0.3cm} +\begin{tabular}{@{}lrrr@{}} +\toprule +Precision & Model Size & GPUs Needed & Annual Cost \\ +\midrule +FP16 & 140 GB & 4 (TP=4) & \$480K \\ +INT8 & 70 GB & 2 (TP=2) & \$240K \\ +INT4 & 35 GB & 1 & \$120K \\ +\bottomrule +\end{tabular} + +\vspace{0.3cm} +\textbf{INT4 doesn't just improve latency --- it eliminates 3 GPUs per replica.}\\ +At 100 replicas for 1000 QPS: that's \textbf{300 fewer GPUs} and \textbf{\$36M saved per year}. + +\vfill +\centering +\small\textcolor{gray}{This is why quantization is a Day 1 architectural decision, not a Day 100 optimization.} +\end{frame} + % --- Slide 3.8: Part 3 Key Takeaway --- \begin{frame}{Part 3: Key Takeaway} \note{[1 min] One sentence. Repeat. @@ -1645,6 +1720,37 @@ Bubble overhead & None & None & $\sim(P{-}1)/(M{+}P{-}1)$ \\ \end{tabular} \end{frame} +% --- Slide 4.3b: AllReduce Concrete Example --- +\begin{frame}[fragile]{AllReduce: A Concrete Example} +\note{[2 min] NUMBERS FIRST, then formula. 
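A sanity check you can do at the board (standard ring AllReduce algebra, independent of the simulator):
each GPU moves roughly $2(N{-}1)/N$ of the gradient bytes, so
$2 \times \frac{7}{8} \times 16\,\mathrm{GB} / 900\,\mathrm{GB/s} \approx 31$\,ms of pure wire time,
consistent with the $\sim$35\,ms the demo prints.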
Students need to see the magnitude before the algebra.} + +\small +\textbf{Setup:} 8 H100 GPUs, NVLink at 900 GB/s, Llama-3 8B (16 GB gradients) + +\vspace{0.3cm} +\begin{enumerate} +\item Each GPU computes its local gradient: \textbf{16 GB} +\item All 8 GPUs must end up with the \textbf{same averaged gradient} +\item Ring AllReduce passes chunks around the ring\ldots +\end{enumerate} + +\vspace{0.3cm} +\begin{lstlisting} +t = mlsysim.core.formulas.calc_ring_allreduce_time( + message_bytes=16e9, + n_gpus=8, + bandwidth_bytes_s=900e9, + latency_s=500e-9, +) +print(f"AllReduce time: {t.to('ms'):.1f}") +# -> ~35 ms (bandwidth-dominated, latency is negligible) +\end{lstlisting} + +\vfill +\centering +\textbf{35 ms} to synchronize 8 GPUs. Now: what happens at 256 GPUs? +\end{frame} + % --- Slide 4.4: Data Parallelism + AllReduce --- \begin{frame}{Wall 14: The Communication Wall (AllReduce)} \note{[3 min] ``Quick: 1 GB of gradients, 8 GPUs, 50 GB/s NVLink. diff --git a/mlsysim/tutorial/slides/tutorial_part2.tex b/mlsysim/tutorial/slides/tutorial_part2.tex index 755c5f823..6499a5667 100644 --- a/mlsysim/tutorial/slides/tutorial_part2.tex +++ b/mlsysim/tutorial/slides/tutorial_part2.tex @@ -102,6 +102,7 @@ Staff / Operations & 15\% & --- \\ % --- 5.4 The Sustainability Equation --- \begin{frame}{Wall 18: The Sustainability Wall} +\note{[3 min] Three levers: energy, PUE, carbon intensity. Geography dominates.} \begin{columns}[T] \column{0.55\textwidth} \begin{block}{The Equation} @@ -141,6 +142,7 @@ print(f"{r.carbon_footprint_kg % --- 5.5 Geography Matters --- \begin{frame}{Carbon Intensity: The 41$\times$ Gap} +\note{[2 min] Show the table. Poland vs Quebec: 41x difference. Same energy, different grid.} \begin{columns}[T] \column{0.45\textwidth} \centering @@ -170,6 +172,7 @@ $\Rightarrow$ \textbf{41$\times$} difference. % --- 5.6 Embodied Carbon --- \begin{frame}{Embodied Carbon: Manufacturing Dominates at the Edge} +\note{[2 min] Key insight: at cloud scale, operational carbon dominates. At TinyML scale, embodied carbon dominates.} \begin{columns}[T] \column{0.5\textwidth} \textbf{Cloud / Training} @@ -199,6 +202,7 @@ Systems with an Architectural Carbon Modeling Tool'' % --- 5.7 Live Demo: Economics --- \begin{frame}[fragile]{Live Demo: TCO + Carbon Analysis} +\note{[3 min] Run live. Show TCO and carbon side by side.} \begin{lstlisting} fleet = mlsysim.Systems.Clusters.Research_256 econ = mlsysim.EconomicsModel() @@ -214,6 +218,7 @@ print(f"Carbon: {result.carbon_footprint_kg/1000:.1f} t") % --- 5.8 Live Demo: Geography Comparison --- \begin{frame}[fragile]{Live Demo: The Carbon Geography Experiment} +\note{[2 min] Run live. Same energy, 41x less carbon. Geography wins.} \begin{lstlisting} fleet = mlsysim.Systems.Clusters.Research_256 solver = mlsysim.SustainabilityModel() @@ -289,6 +294,7 @@ for cluster_name in ["H100", "MI300X", "Gaudi3"]: % --- 5.10 Takeaway --- \begin{frame}{Key Takeaway: Economics \& Sustainability} +\note{[1 min] Three points, repeat each.} \begin{center} \Large \begin{enumerate} @@ -312,6 +318,7 @@ The cleanest watt is the one from a hydro dam.''} % --- 6.1 Key Question --- \begin{frame}{Key Question} +\note{[1 min] Frame the search problem. The space is combinatorial.} \begin{center} \Large\bfseries Given a budget and an SLA,\\[4pt] @@ -326,6 +333,7 @@ can easily exceed $10^4$ configurations. % --- 6.2 The DSE Pattern --- \begin{frame}[fragile]{The DSE Pattern: Declare, Search, Rank} +\note{[3 min] Three-step pattern. 
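Worth doing the arithmetic aloud (both numbers are on the surrounding slides):
at under 1\,ms per analytical evaluation, even $10^4$ configurations sweep in
about ten seconds on a laptop.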
Emphasize: analytical models make exhaustive search feasible.}
\begin{columns}[T]
\column{0.5\textwidth}
\begin{enumerate}

takes $<$1\,ms $\Rightarrow$ exhaustive search.

% --- 6.3 Pareto Fronts ---
\begin{frame}{Pareto Fronts: No Free Lunch}
+\note{[2 min] Explain the Pareto front. The knee is usually the sweet spot.}
\centering
\includegraphics[width=0.65\textwidth]{figures/pareto-placeholder.pdf}

improving one metric \emph{must} worsen another.

% --- 6.4 Live Demo: Engine.sweep ---
\begin{frame}[fragile]{Live Demo: Design Space Sweep}
+\note{[3 min] Run live. Note multi-vendor hardware list: H100, MI300X, Gaudi3, B200.}
\begin{lstlisting}
hw_list = [mlsysim.Hardware.Cloud.H100,
           mlsysim.Hardware.Cloud.MI300X,

for r in sorted(results,

% --- 6.5 Live Demo: DSE with Constraints ---
\begin{frame}[fragile]{Live Demo: DSE with Objective \& Constraints}
+\note{[3 min] Run live. Show maximize throughput subject to latency constraint.}
\begin{lstlisting}
from mlsysim.core.dse import DSE

print(f"Best: {best['best_params']}")

% --- 6.6 Batching Optimizer ---
\begin{frame}[fragile]{Live Demo: Batching Optimizer (Pareto Front)}
+\note{[3 min] Run live. Show Pareto front of batch size vs latency.}
\begin{lstlisting}
opt = mlsysim.BatchingOptimizer()
result = opt.solve(

B200 is faster but fewer GPUs fit in budget.}

% --- 6.8 Takeaway ---
\begin{frame}{Key Takeaway: Design Space Exploration}
+\note{[1 min] Three points, repeat each.}
\begin{center}
\Large
\begin{enumerate}

You cannot model what you cannot measure.''}

% --- 7.1 Key Question ---
\begin{frame}{Key Question}
+\note{[1 min] ``Same equation, 9 orders of magnitude apart.''}
\begin{center}
\Large\bfseries Can the same analytical framework model\\[4pt]

Only the numbers change. The physics is the same.

% --- 7.2 The Nine Orders of Magnitude ---
\begin{frame}{The 9-Order-of-Magnitude Scale Span}
+\note{[2 min] Walk through the table. Compute spans $10^{7}\times$, power $10^{4.7}\times$.}
\begin{columns}[T]
\column{0.52\textwidth}
\centering

H100 SXM & 989\,T & 700\,W \\

% --- 7.3 TinyML Memory Hierarchy ---
\begin{frame}{Flash vs SRAM: The TinyML Memory Wall}
+\note{[3 min] Key difference: TinyML has Flash (8 MB, 80 MB/s) vs Cloud HBM (80 GB, 3.35 TB/s).}
\begin{columns}[T]
\column{0.5\textwidth}
\textbf{Cloud GPU (H100)}

H100 SXM & 989\,T & 700\,W \\

% --- 7.4 Energy per Inference ---
\begin{frame}{Energy per Inference: $\mu$J to Joules}
+\note{[2 min] 6 orders of magnitude in energy. At TinyML scale, battery life is the constraint.}
\centering
\begin{tabular}{lrrl}
\toprule

H100 SXM & 700\,W & $\sim$50\,J & LLM inference \\

% --- 7.5 Live Demo: Hardware Comparison ---
\begin{frame}[fragile]{Live Demo: nRF52840 vs ESP32 vs H100}
+\note{[3 min] Run live. nRF52840 is memory-bound (Flash). H100 finishes in microseconds.}
\begin{lstlisting}
tiny_model = mlsysim.Models.Tiny.KeywordSpotting
devices = [mlsysim.Hardware.Tiny.nRF52840,

mlsysim also supports \textbf{Coral Edge TPU}, Jetson Orin, and Inferentia2.

% --- 7.6 Same Roofline, Different Physics ---
\begin{frame}[fragile]{Same Roofline, Different Physics}
+\note{[2 min] Both are memory-bound at batch size 1. Same equation, same diagnosis.}
\begin{columns}[T]
\column{0.5\textwidth}
\begin{center}

ESP32-S3 at 400\,mW drains the battery in weeks without aggressive duty cycling.
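+% --- 7.7b Duty-Cycle Arithmetic ---
+\begin{frame}[fragile]{Duty Cycling: A Back-of-Envelope Check}
+\note{[1 min] Optional. Plain Python, no mlsysim API. The 400 mW active figure is from the previous slide; the 2000 mAh battery and 1 mW sleep draw are assumed example values.}
+
+\small
+A minimal sketch of the duty-cycling claim above. The 400\,mW active draw comes
+from the previous slide; the 2000\,mAh cell and 1\,mW sleep draw are
+\emph{assumed} example values, not mlsysim defaults:
+
+\begin{lstlisting}
+battery_j = 2.0 * 3.7 * 3600      # 2000 mAh at 3.7 V ~= 26.6 kJ
+active_w, sleep_w = 0.400, 0.001  # ESP32-S3 awake vs deep sleep
+
+for duty in (1.00, 0.10, 0.01):   # fraction of time awake
+    avg_w = active_w * duty + sleep_w * (1 - duty)
+    print(f"duty {duty:4.0%}: {battery_j / avg_w / 86400:5.1f} days")
+# 100% -> ~0.8 days; 1% -> ~62 days. Duty cycling buys ~80x.
+\end{lstlisting}
+
+\vfill
+\centering
+\small\textcolor{gray}{Average power, not peak power, sets the battery budget.}
+\end{frame}
+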
% --- 7.8 Takeaway --- \begin{frame}{Key Takeaway: TinyML to Frontier} +\note{[1 min] Three points. Right-size hardware to the workload.} \begin{center} \Large \begin{enumerate} @@ -697,6 +717,7 @@ that meets your latency and accuracy SLA.''} % --- 8.1 Key Question --- \begin{frame}{Key Question} +\note{[1 min] Single-wall analysis is not enough. You need the full pipeline.} \begin{center} \Large\bfseries How do you compose multiple analytical models\\[4pt]