mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-06 17:49:07 -05:00
fix(tutorial): simulation round 1 fixes — eta explanation, AllReduce numbers-first, compression fleet
1. Added an 'Understanding eta' slide with a table and CPI analogy (placed before the first demo). 2. Added a concrete AllReduce example with numbers BEFORE the formula. 3. Added a 'Compression Changes Fleet Architecture' slide with a cost table. Addresses the top 3 issues from simulation feedback round 1.
This commit is contained in:
@@ -603,6 +603,53 @@ FlashAttention & 60--75\% \\
|
||||
\alert{Improving MFU is often cheaper than buying more GPUs.}
|
||||
\end{frame}
|
||||
|
||||
% --- Slide 1.8b: What Is Eta? ---
% Defines the efficiency parameter eta (MFU) once, before any demo uses it.
\begin{frame}{What Is $\eta$? (The Efficiency Parameter)}
\note{[3 min] CRITICAL --- every demo uses eta. Explain it ONCE here, then it's understood for the day.

ANALOGY: ``eta is to ML systems what CPI is to CPU design --- an empirical constant that bridges peak specs and reality.''}

\small
% Definition set in math mode (semantic markup) rather than bolded prose.
\textbf{Model FLOPs Utilization:} $\eta = \text{Achieved FLOPs} \,/\, \text{Peak FLOPs}$

\vspace{0.3cm}
The gap between what your hardware \emph{could} do and what it \emph{actually} does.

\vspace{0.3cm}
\begin{columns}[T]
  \begin{column}{0.48\textwidth}
    \textbf{What reduces $\eta$:}
    \begin{itemize}\setlength\itemsep{1pt}
      \item Kernel launch overhead
      \item SM occupancy limits
      \item Memory coalescing misses
      \item Framework overhead (Python GIL)
      \item Communication stalls
    \end{itemize}
  \end{column}
  \begin{column}{0.48\textwidth}
    \textbf{Typical values:}

    \scriptsize
    % Empirical ranges; measured, not predicted (see footer line).
    \begin{tabular}{@{}lr@{}}
      \toprule
      Scenario & $\eta$ \\
      \midrule
      Training (Megatron-LM) & 0.40--0.55 \\
      Training (PyTorch eager) & 0.08--0.15 \\
      Inference decode, bs=1 & 0.01--0.05 \\
      Inference decode, bs=32+ & 0.15--0.35 \\
      Inference prefill & 0.30--0.50 \\
      TinyML (TFLite Micro) & 0.05--0.15 \\
      \bottomrule
    \end{tabular}
  \end{column}
\end{columns}

\vfill
\centering
\small\textcolor{gray}{You do not predict $\eta$ --- you measure it once and use it for what-if analysis.}
\end{frame}
|
||||
|
||||
% --- Slide 1.9: The Iron Law (Full) ---
|
||||
\begin{frame}{The Iron Law of ML Systems}
|
||||
\note{[3 min] Walk through each denominator term. Point out that every
|
||||
@@ -1492,6 +1539,34 @@ print(f"Prune: {r2.inference_speedup:.1f}x / "
|
||||
\end{lstlisting}
|
||||
\end{frame}
|
||||
|
||||
% --- Slide 3.7b: Compression Changes Fleet Architecture ---
% Shows that quantization is an architectural decision: it changes GPU count
% per replica, not just per-request latency.
\begin{frame}{Compression Changes Fleet Architecture}
\note{[3 min] This is the ``aha'' that compression is architecture, not optimization.
The punchline: INT4 halves your GPU count AND your electricity bill.}

\small
\textbf{Llama-3 70B Serving Fleet:}

\vspace{0.3cm}
% Per-replica cost at each precision; annual cost scales with GPUs needed.
\begin{tabular}{@{}lrrr@{}}
  \toprule
  Precision & Model Size & GPUs Needed & Annual Cost \\
  \midrule
  FP16 & 140 GB & 4 (TP=4) & \$480K \\
  INT8 & 70 GB  & 2 (TP=2) & \$240K \\
  INT4 & 35 GB  & 1        & \$120K \\
  \bottomrule
\end{tabular}

\vspace{0.3cm}
% Fleet-level arithmetic: 3 GPUs saved/replica x 100 replicas = 300 GPUs;
% (\$480K - \$120K)/replica/yr x 100 replicas = \$36M/yr.
\textbf{INT4 doesn't just improve latency --- it eliminates 3 GPUs per replica.}\\
At 100 replicas for 1000 QPS: that's \textbf{300 fewer GPUs} and \textbf{\$36M saved per year}.

\vfill
\centering
\small\textcolor{gray}{This is why quantization is a Day 1 architectural decision, not a Day 100 optimization.}
\end{frame}
|
||||
|
||||
% --- Slide 3.8: Part 3 Key Takeaway ---
|
||||
\begin{frame}{Part 3: Key Takeaway}
|
||||
\note{[1 min] One sentence. Repeat.
|
||||
@@ -1645,6 +1720,37 @@ Bubble overhead & None & None & $\sim(P{-}1)/(M{+}P{-}1)$ \\
|
||||
\end{tabular}
|
||||
\end{frame}
|
||||
|
||||
% --- Slide 4.3b: AllReduce Concrete Example ---
% Concrete magnitude (35 ms for 8 GPUs) shown BEFORE the general formula on
% the following communication-wall slide.
\begin{frame}[fragile]{AllReduce: A Concrete Example}
\note{[2 min] NUMBERS FIRST, then formula. Students need to see the magnitude before the algebra.}

\small
\textbf{Setup:} 8 H100 GPUs, NVLink at 900 GB/s, Llama-3 8B (16 GB gradients)

\vspace{0.3cm}
\begin{enumerate}
  \item Each GPU computes its local gradient: \textbf{16 GB}
  \item All 8 GPUs must end up with the \textbf{same averaged gradient}
  \item Ring AllReduce passes chunks around the ring\ldots
\end{enumerate}

\vspace{0.3cm}
\begin{lstlisting}
t = mlsysim.core.formulas.calc_ring_allreduce_time(
    message_bytes=16e9,
    n_gpus=8,
    bandwidth_bytes_s=900e9,
    latency_s=500e-9,
)
print(f"AllReduce time: {t.to('ms'):.1f}")
# -> ~35 ms (bandwidth-dominated, latency is negligible)
\end{lstlisting}

\vfill
\centering
\textbf{35 ms} to synchronize 8 GPUs. Now: what happens at 256 GPUs?
\end{frame}
|
||||
|
||||
% --- Slide 4.4: Data Parallelism + AllReduce ---
|
||||
\begin{frame}{Wall 14: The Communication Wall (AllReduce)}
|
||||
\note{[3 min] ``Quick: 1 GB of gradients, 8 GPUs, 50 GB/s NVLink.
|
||||
|
||||
@@ -102,6 +102,7 @@ Staff / Operations & 15\% & --- \\
|
||||
|
||||
% --- 5.4 The Sustainability Equation ---
|
||||
\begin{frame}{Wall 18: The Sustainability Wall}
|
||||
\note{[3 min] Three levers: energy, PUE, carbon intensity. Geography dominates.}
|
||||
\begin{columns}[T]
|
||||
\column{0.55\textwidth}
|
||||
\begin{block}{The Equation}
|
||||
@@ -141,6 +142,7 @@ print(f"{r.carbon_footprint_kg
|
||||
|
||||
% --- 5.5 Geography Matters ---
|
||||
\begin{frame}{Carbon Intensity: The 41$\times$ Gap}
|
||||
\note{[2 min] Show the table. Poland vs Quebec: 41x difference. Same energy, different grid.}
|
||||
\begin{columns}[T]
|
||||
\column{0.45\textwidth}
|
||||
\centering
|
||||
@@ -170,6 +172,7 @@ $\Rightarrow$ \textbf{41$\times$} difference.
|
||||
|
||||
% --- 5.6 Embodied Carbon ---
|
||||
\begin{frame}{Embodied Carbon: Manufacturing Dominates at the Edge}
|
||||
\note{[2 min] Key insight: at cloud scale, operational carbon dominates. At TinyML scale, embodied carbon dominates.}
|
||||
\begin{columns}[T]
|
||||
\column{0.5\textwidth}
|
||||
\textbf{Cloud / Training}
|
||||
@@ -199,6 +202,7 @@ Systems with an Architectural Carbon Modeling Tool''
|
||||
|
||||
% --- 5.7 Live Demo: Economics ---
|
||||
\begin{frame}[fragile]{Live Demo: TCO + Carbon Analysis}
|
||||
\note{[3 min] Run live. Show TCO and carbon side by side.}
|
||||
\begin{lstlisting}
|
||||
fleet = mlsysim.Systems.Clusters.Research_256
|
||||
econ = mlsysim.EconomicsModel()
|
||||
@@ -214,6 +218,7 @@ print(f"Carbon: {result.carbon_footprint_kg/1000:.1f} t")
|
||||
|
||||
% --- 5.8 Live Demo: Geography Comparison ---
|
||||
\begin{frame}[fragile]{Live Demo: The Carbon Geography Experiment}
|
||||
\note{[2 min] Run live. Same energy, 41x less carbon. Geography wins.}
|
||||
\begin{lstlisting}
|
||||
fleet = mlsysim.Systems.Clusters.Research_256
|
||||
solver = mlsysim.SustainabilityModel()
|
||||
@@ -289,6 +294,7 @@ for cluster_name in ["H100", "MI300X", "Gaudi3"]:
|
||||
|
||||
% --- 5.10 Takeaway ---
|
||||
\begin{frame}{Key Takeaway: Economics \& Sustainability}
|
||||
\note{[1 min] Three points, repeat each.}
|
||||
\begin{center}
|
||||
\Large
|
||||
\begin{enumerate}
|
||||
@@ -312,6 +318,7 @@ The cleanest watt is the one from a hydro dam.''}
|
||||
|
||||
% --- 6.1 Key Question ---
|
||||
\begin{frame}{Key Question}
|
||||
\note{[1 min] Frame the search problem. The space is combinatorial.}
|
||||
\begin{center}
|
||||
\Large\bfseries
|
||||
Given a budget and an SLA,\\[4pt]
|
||||
@@ -326,6 +333,7 @@ can easily exceed $10^4$ configurations.
|
||||
|
||||
% --- 6.2 The DSE Pattern ---
|
||||
\begin{frame}[fragile]{The DSE Pattern: Declare, Search, Rank}
|
||||
\note{[3 min] Three-step pattern. Emphasize: analytical models make exhaustive search feasible.}
|
||||
\begin{columns}[T]
|
||||
\column{0.5\textwidth}
|
||||
\begin{enumerate}
|
||||
@@ -368,6 +376,7 @@ takes $<$1\,ms $\Rightarrow$ exhaustive search.
|
||||
|
||||
% --- 6.3 Pareto Fronts ---
|
||||
\begin{frame}{Pareto Fronts: No Free Lunch}
|
||||
\note{[2 min] Explain the Pareto front. The knee is usually the sweet spot.}
|
||||
\centering
|
||||
\includegraphics[width=0.65\textwidth]{figures/pareto-placeholder.pdf}
|
||||
|
||||
@@ -384,6 +393,7 @@ improving one metric \emph{must} worsen another.
|
||||
|
||||
% --- 6.4 Live Demo: Engine.sweep ---
|
||||
\begin{frame}[fragile]{Live Demo: Design Space Sweep}
|
||||
\note{[3 min] Run live. Note multi-vendor hardware list: H100, MI300X, Gaudi3, B200.}
|
||||
\begin{lstlisting}
|
||||
hw_list = [mlsysim.Hardware.Cloud.H100,
|
||||
mlsysim.Hardware.Cloud.MI300X,
|
||||
@@ -402,6 +412,7 @@ for r in sorted(results,
|
||||
|
||||
% --- 6.5 Live Demo: DSE with Constraints ---
|
||||
\begin{frame}[fragile]{Live Demo: DSE with Objective \& Constraints}
|
||||
\note{[3 min] Run live. Show maximize throughput subject to latency constraint.}
|
||||
\begin{lstlisting}
|
||||
from mlsysim.core.dse import DSE
|
||||
|
||||
@@ -423,6 +434,7 @@ print(f"Best: {best['best_params']}")
|
||||
|
||||
% --- 6.6 Batching Optimizer ---
|
||||
\begin{frame}[fragile]{Live Demo: Batching Optimizer (Pareto Front)}
|
||||
\note{[3 min] Run live. Show Pareto front of batch size vs latency.}
|
||||
\begin{lstlisting}
|
||||
opt = mlsysim.BatchingOptimizer()
|
||||
result = opt.solve(
|
||||
@@ -457,6 +469,7 @@ B200 is faster but fewer GPUs fit in budget.}
|
||||
|
||||
% --- 6.8 Takeaway ---
|
||||
\begin{frame}{Key Takeaway: Design Space Exploration}
|
||||
\note{[1 min] Three points, repeat each.}
|
||||
\begin{center}
|
||||
\Large
|
||||
\begin{enumerate}
|
||||
@@ -480,6 +493,7 @@ You cannot model what you cannot measure.''}
|
||||
|
||||
% --- 7.1 Key Question ---
|
||||
\begin{frame}{Key Question}
|
||||
\note{[1 min] ``Same equation, 9 orders of magnitude apart.''}
|
||||
\begin{center}
|
||||
\Large\bfseries
|
||||
Can the same analytical framework model\\[4pt]
|
||||
@@ -493,6 +507,7 @@ Only the numbers change. The physics is the same.
|
||||
|
||||
% --- 7.2 The Nine Orders of Magnitude ---
|
||||
\begin{frame}{The 9-Order-of-Magnitude Scale Span}
|
||||
\note{[2 min] Walk through the table. Compute spans 10\^7x, power 10\^4.7x.}
|
||||
\begin{columns}[T]
|
||||
\column{0.52\textwidth}
|
||||
\centering
|
||||
@@ -526,6 +541,7 @@ H100 SXM & 989\,T & 700\,W \\
|
||||
|
||||
% --- 7.3 TinyML Memory Hierarchy ---
|
||||
\begin{frame}{Flash vs SRAM: The TinyML Memory Wall}
|
||||
\note{[3 min] Key difference: TinyML has Flash (8 MB, 80 MB/s) vs Cloud HBM (80 GB, 3.35 TB/s).}
|
||||
\begin{columns}[T]
|
||||
\column{0.5\textwidth}
|
||||
\textbf{Cloud GPU (H100)}
|
||||
@@ -558,6 +574,7 @@ H100 SXM & 989\,T & 700\,W \\
|
||||
|
||||
% --- 7.4 Energy per Inference ---
|
||||
\begin{frame}{Energy per Inference: $\mu$J to Joules}
|
||||
\note{[2 min] 6 orders of magnitude in energy. At TinyML scale, battery life is the constraint.}
|
||||
\centering
|
||||
\begin{tabular}{lrrl}
|
||||
\toprule
|
||||
@@ -581,6 +598,7 @@ H100 SXM & 700\,W & $\sim$50\,J & LLM inference \\
|
||||
|
||||
% --- 7.5 Live Demo: Hardware Comparison ---
|
||||
\begin{frame}[fragile]{Live Demo: nRF52840 vs ESP32 vs H100}
|
||||
\note{[3 min] Run live. nRF52840 is memory-bound (Flash). H100 finishes in microseconds.}
|
||||
\begin{lstlisting}
|
||||
tiny_model = mlsysim.Models.Tiny.KeywordSpotting
|
||||
devices = [mlsysim.Hardware.Tiny.nRF52840,
|
||||
@@ -604,6 +622,7 @@ mlsysim also supports \textbf{Coral Edge TPU}, Jetson Orin, and Inferentia2.
|
||||
|
||||
% --- 7.6 Same Roofline, Different Physics ---
|
||||
\begin{frame}[fragile]{Same Roofline, Different Physics}
|
||||
\note{[2 min] Both are memory-bound at batch size 1. Same equation, same diagnosis.}
|
||||
\begin{columns}[T]
|
||||
\column{0.5\textwidth}
|
||||
\begin{center}
|
||||
@@ -674,6 +693,7 @@ ESP32-S3 at 400\,mW drains the battery in weeks without aggressive duty cycling.
|
||||
|
||||
% --- 7.8 Takeaway ---
|
||||
\begin{frame}{Key Takeaway: TinyML to Frontier}
|
||||
\note{[1 min] Three points. Right-size hardware to the workload.}
|
||||
\begin{center}
|
||||
\Large
|
||||
\begin{enumerate}
|
||||
@@ -697,6 +717,7 @@ that meets your latency and accuracy SLA.''}
|
||||
|
||||
% --- 8.1 Key Question ---
|
||||
\begin{frame}{Key Question}
|
||||
\note{[1 min] Single-wall analysis is not enough. You need the full pipeline.}
|
||||
\begin{center}
|
||||
\Large\bfseries
|
||||
How do you compose multiple analytical models\\[4pt]
|
||||
|
||||
Reference in New Issue
Block a user