diff --git a/mlsysim/tutorial/slides/tutorial_part1.tex b/mlsysim/tutorial/slides/tutorial_part1.tex index 60bfda865..f3c9259b4 100644 --- a/mlsysim/tutorial/slides/tutorial_part1.tex +++ b/mlsysim/tutorial/slides/tutorial_part1.tex @@ -603,6 +603,53 @@ FlashAttention & 60--75\% \\ \alert{Improving MFU is often cheaper than buying more GPUs.} \end{frame} +% --- Slide 1.8b: What Is Eta? --- +\begin{frame}{What Is $\eta$? (The Efficiency Parameter)} +\note{[3 min] CRITICAL — every demo uses eta. Explain it ONCE here, then it's understood for the day. +ANALOGY: ``eta is to ML systems what CPI is to CPU design --- an empirical constant that bridges peak specs and reality.''} + +\small +\textbf{$\eta$ = Achieved FLOPS / Peak FLOPS} (Model FLOPs Utilization) + +\vspace{0.3cm} +The gap between what your hardware \emph{could} do and what it \emph{actually} does. + +\vspace{0.3cm} +\begin{columns}[T] +\begin{column}{0.48\textwidth} +\textbf{What reduces $\eta$:} +\begin{itemize}\setlength\itemsep{1pt} +\item Kernel launch overhead +\item SM occupancy limits +\item Memory coalescing misses +\item Framework overhead (Python GIL) +\item Communication stalls +\end{itemize} +\end{column} +\begin{column}{0.48\textwidth} +\textbf{Typical values:} + +\scriptsize +\begin{tabular}{@{}lr@{}} +\toprule +Scenario & $\eta$ \\ +\midrule +Training (Megatron-LM) & 0.40--0.55 \\ +Training (PyTorch eager) & 0.08--0.15 \\ +Inference decode, bs=1 & 0.01--0.05 \\ +Inference decode, bs=32+ & 0.15--0.35 \\ +Inference prefill & 0.30--0.50 \\ +TinyML (TFLite Micro) & 0.05--0.15 \\ +\bottomrule +\end{tabular} +\end{column} +\end{columns} + +\vfill +\centering +\small\textcolor{gray}{You do not predict $\eta$ --- you measure it once and use it for what-if analysis.} +\end{frame} + % --- Slide 1.9: The Iron Law (Full) --- \begin{frame}{The Iron Law of ML Systems} \note{[3 min] Walk through each denominator term. Point out that every @@ -1492,6 +1539,34 @@ print(f"Prune: {r2.inference_speedup:.1f}x / " \end{lstlisting} \end{frame} +% --- Slide 3.7b: Compression Changes Fleet Architecture --- +\begin{frame}{Compression Changes Fleet Architecture} +\note{[3 min] This is the ``aha'' that compression is architecture, not optimization. +The punchline: INT4 halves your GPU count AND your electricity bill.} + +\small +\textbf{Llama-3 70B Serving Fleet:} + +\vspace{0.3cm} +\begin{tabular}{@{}lrrr@{}} +\toprule +Precision & Model Size & GPUs Needed & Annual Cost \\ +\midrule +FP16 & 140 GB & 4 (TP=4) & \$480K \\ +INT8 & 70 GB & 2 (TP=2) & \$240K \\ +INT4 & 35 GB & 1 & \$120K \\ +\bottomrule +\end{tabular} + +\vspace{0.3cm} +\textbf{INT4 doesn't just improve latency --- it eliminates 3 GPUs per replica.}\\ +At 100 replicas for 1000 QPS: that's \textbf{300 fewer GPUs} and \textbf{\$36M saved per year}. + +\vfill +\centering +\small\textcolor{gray}{This is why quantization is a Day 1 architectural decision, not a Day 100 optimization.} +\end{frame} + % --- Slide 3.8: Part 3 Key Takeaway --- \begin{frame}{Part 3: Key Takeaway} \note{[1 min] One sentence. Repeat. @@ -1645,6 +1720,37 @@ Bubble overhead & None & None & $\sim(P{-}1)/(M{+}P{-}1)$ \\ \end{tabular} \end{frame} +% --- Slide 4.3b: AllReduce Concrete Example --- +\begin{frame}[fragile]{AllReduce: A Concrete Example} +\note{[2 min] NUMBERS FIRST, then formula. 
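A sanity check you can do at the board (standard ring AllReduce algebra, independent of the simulator):
each GPU moves roughly $2(N{-}1)/N$ of the gradient bytes, so
$2 \times \frac{7}{8} \times 16\,\mathrm{GB} / 900\,\mathrm{GB/s} \approx 31$\,ms of pure wire time,
consistent with the $\sim$35\,ms the demo prints.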
Students need to see the magnitude before the algebra.} + +\small +\textbf{Setup:} 8 H100 GPUs, NVLink at 900 GB/s, Llama-3 8B (16 GB gradients) + +\vspace{0.3cm} +\begin{enumerate} +\item Each GPU computes its local gradient: \textbf{16 GB} +\item All 8 GPUs must end up with the \textbf{same averaged gradient} +\item Ring AllReduce passes chunks around the ring\ldots +\end{enumerate} + +\vspace{0.3cm} +\begin{lstlisting} +t = mlsysim.core.formulas.calc_ring_allreduce_time( + message_bytes=16e9, + n_gpus=8, + bandwidth_bytes_s=900e9, + latency_s=500e-9, +) +print(f"AllReduce time: {t.to('ms'):.1f}") +# -> ~35 ms (bandwidth-dominated, latency is negligible) +\end{lstlisting} + +\vfill +\centering +\textbf{35 ms} to synchronize 8 GPUs. Now: what happens at 256 GPUs? +\end{frame} + % --- Slide 4.4: Data Parallelism + AllReduce --- \begin{frame}{Wall 14: The Communication Wall (AllReduce)} \note{[3 min] ``Quick: 1 GB of gradients, 8 GPUs, 50 GB/s NVLink. diff --git a/mlsysim/tutorial/slides/tutorial_part2.tex b/mlsysim/tutorial/slides/tutorial_part2.tex index 755c5f823..6499a5667 100644 --- a/mlsysim/tutorial/slides/tutorial_part2.tex +++ b/mlsysim/tutorial/slides/tutorial_part2.tex @@ -102,6 +102,7 @@ Staff / Operations & 15\% & --- \\ % --- 5.4 The Sustainability Equation --- \begin{frame}{Wall 18: The Sustainability Wall} +\note{[3 min] Three levers: energy, PUE, carbon intensity. Geography dominates.} \begin{columns}[T] \column{0.55\textwidth} \begin{block}{The Equation} @@ -141,6 +142,7 @@ print(f"{r.carbon_footprint_kg % --- 5.5 Geography Matters --- \begin{frame}{Carbon Intensity: The 41$\times$ Gap} +\note{[2 min] Show the table. Poland vs Quebec: 41x difference. Same energy, different grid.} \begin{columns}[T] \column{0.45\textwidth} \centering @@ -170,6 +172,7 @@ $\Rightarrow$ \textbf{41$\times$} difference. % --- 5.6 Embodied Carbon --- \begin{frame}{Embodied Carbon: Manufacturing Dominates at the Edge} +\note{[2 min] Key insight: at cloud scale, operational carbon dominates. At TinyML scale, embodied carbon dominates.} \begin{columns}[T] \column{0.5\textwidth} \textbf{Cloud / Training} @@ -199,6 +202,7 @@ Systems with an Architectural Carbon Modeling Tool'' % --- 5.7 Live Demo: Economics --- \begin{frame}[fragile]{Live Demo: TCO + Carbon Analysis} +\note{[3 min] Run live. Show TCO and carbon side by side.} \begin{lstlisting} fleet = mlsysim.Systems.Clusters.Research_256 econ = mlsysim.EconomicsModel() @@ -214,6 +218,7 @@ print(f"Carbon: {result.carbon_footprint_kg/1000:.1f} t") % --- 5.8 Live Demo: Geography Comparison --- \begin{frame}[fragile]{Live Demo: The Carbon Geography Experiment} +\note{[2 min] Run live. Same energy, 41x less carbon. Geography wins.} \begin{lstlisting} fleet = mlsysim.Systems.Clusters.Research_256 solver = mlsysim.SustainabilityModel() @@ -289,6 +294,7 @@ for cluster_name in ["H100", "MI300X", "Gaudi3"]: % --- 5.10 Takeaway --- \begin{frame}{Key Takeaway: Economics \& Sustainability} +\note{[1 min] Three points, repeat each.} \begin{center} \Large \begin{enumerate} @@ -312,6 +318,7 @@ The cleanest watt is the one from a hydro dam.''} % --- 6.1 Key Question --- \begin{frame}{Key Question} +\note{[1 min] Frame the search problem. The space is combinatorial.} \begin{center} \Large\bfseries Given a budget and an SLA,\\[4pt] @@ -326,6 +333,7 @@ can easily exceed $10^4$ configurations. % --- 6.2 The DSE Pattern --- \begin{frame}[fragile]{The DSE Pattern: Declare, Search, Rank} +\note{[3 min] Three-step pattern. 
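Worth doing the arithmetic aloud (both numbers are on the surrounding slides):
at under 1\,ms per analytical evaluation, even $10^4$ configurations sweep in
about ten seconds on a laptop.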
Emphasize: analytical models make exhaustive search feasible.}
\begin{columns}[T]
\column{0.5\textwidth}
\begin{enumerate}

takes $<$1\,ms $\Rightarrow$ exhaustive search.

% --- 6.3 Pareto Fronts ---
\begin{frame}{Pareto Fronts: No Free Lunch}
+\note{[2 min] Explain the Pareto front. The knee is usually the sweet spot.}
\centering
\includegraphics[width=0.65\textwidth]{figures/pareto-placeholder.pdf}

improving one metric \emph{must} worsen another.

% --- 6.4 Live Demo: Engine.sweep ---
\begin{frame}[fragile]{Live Demo: Design Space Sweep}
+\note{[3 min] Run live. Note multi-vendor hardware list: H100, MI300X, Gaudi3, B200.}
\begin{lstlisting}
hw_list = [mlsysim.Hardware.Cloud.H100,
           mlsysim.Hardware.Cloud.MI300X,

for r in sorted(results,

% --- 6.5 Live Demo: DSE with Constraints ---
\begin{frame}[fragile]{Live Demo: DSE with Objective \& Constraints}
+\note{[3 min] Run live. Show maximize throughput subject to latency constraint.}
\begin{lstlisting}
from mlsysim.core.dse import DSE

print(f"Best: {best['best_params']}")

% --- 6.6 Batching Optimizer ---
\begin{frame}[fragile]{Live Demo: Batching Optimizer (Pareto Front)}
+\note{[3 min] Run live. Show Pareto front of batch size vs latency.}
\begin{lstlisting}
opt = mlsysim.BatchingOptimizer()
result = opt.solve(

B200 is faster but fewer GPUs fit in budget.}

% --- 6.8 Takeaway ---
\begin{frame}{Key Takeaway: Design Space Exploration}
+\note{[1 min] Three points, repeat each.}
\begin{center}
\Large
\begin{enumerate}

You cannot model what you cannot measure.''}

% --- 7.1 Key Question ---
\begin{frame}{Key Question}
+\note{[1 min] ``Same equation, 9 orders of magnitude apart.''}
\begin{center}
\Large\bfseries Can the same analytical framework model\\[4pt]

Only the numbers change. The physics is the same.

% --- 7.2 The Nine Orders of Magnitude ---
\begin{frame}{The 9-Order-of-Magnitude Scale Span}
+\note{[2 min] Walk through the table. Compute spans $10^{7}\times$, power $10^{4.7}\times$.}
\begin{columns}[T]
\column{0.52\textwidth}
\centering

H100 SXM & 989\,T & 700\,W \\

% --- 7.3 TinyML Memory Hierarchy ---
\begin{frame}{Flash vs SRAM: The TinyML Memory Wall}
+\note{[3 min] Key difference: TinyML has Flash (8 MB, 80 MB/s) vs Cloud HBM (80 GB, 3.35 TB/s).}
\begin{columns}[T]
\column{0.5\textwidth}
\textbf{Cloud GPU (H100)}

H100 SXM & 989\,T & 700\,W \\

% --- 7.4 Energy per Inference ---
\begin{frame}{Energy per Inference: $\mu$J to Joules}
+\note{[2 min] 6 orders of magnitude in energy. At TinyML scale, battery life is the constraint.}
\centering
\begin{tabular}{lrrl}
\toprule

H100 SXM & 700\,W & $\sim$50\,J & LLM inference \\

% --- 7.5 Live Demo: Hardware Comparison ---
\begin{frame}[fragile]{Live Demo: nRF52840 vs ESP32 vs H100}
+\note{[3 min] Run live. nRF52840 is memory-bound (Flash). H100 finishes in microseconds.}
\begin{lstlisting}
tiny_model = mlsysim.Models.Tiny.KeywordSpotting
devices = [mlsysim.Hardware.Tiny.nRF52840,

mlsysim also supports \textbf{Coral Edge TPU}, Jetson Orin, and Inferentia2.

% --- 7.6 Same Roofline, Different Physics ---
\begin{frame}[fragile]{Same Roofline, Different Physics}
+\note{[2 min] Both are memory-bound at batch size 1. Same equation, same diagnosis.}
\begin{columns}[T]
\column{0.5\textwidth}
\begin{center}

ESP32-S3 at 400\,mW drains the battery in weeks without aggressive duty cycling.
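+% --- 7.7b Duty-Cycle Arithmetic ---
+\begin{frame}[fragile]{Duty Cycling: A Back-of-Envelope Check}
+\note{[1 min] Optional. Plain Python, no mlsysim API. The 400 mW active figure is from the previous slide; the 2000 mAh battery and 1 mW sleep draw are assumed example values.}
+
+\small
+A minimal sketch of the duty-cycling claim above. The 400\,mW active draw comes
+from the previous slide; the 2000\,mAh cell and 1\,mW sleep draw are
+\emph{assumed} example values, not mlsysim defaults:
+
+\begin{lstlisting}
+battery_j = 2.0 * 3.7 * 3600      # 2000 mAh at 3.7 V ~= 26.6 kJ
+active_w, sleep_w = 0.400, 0.001  # ESP32-S3 awake vs deep sleep
+
+for duty in (1.00, 0.10, 0.01):   # fraction of time awake
+    avg_w = active_w * duty + sleep_w * (1 - duty)
+    print(f"duty {duty:4.0%}: {battery_j / avg_w / 86400:5.1f} days")
+# 100% -> ~0.8 days; 1% -> ~62 days. Duty cycling buys ~80x.
+\end{lstlisting}
+
+\vfill
+\centering
+\small\textcolor{gray}{Average power, not peak power, sets the battery budget.}
+\end{frame}
+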
% --- 7.8 Takeaway --- \begin{frame}{Key Takeaway: TinyML to Frontier} +\note{[1 min] Three points. Right-size hardware to the workload.} \begin{center} \Large \begin{enumerate} @@ -697,6 +717,7 @@ that meets your latency and accuracy SLA.''} % --- 8.1 Key Question --- \begin{frame}{Key Question} +\note{[1 min] Single-wall analysis is not enough. You need the full pipeline.} \begin{center} \Large\bfseries How do you compose multiple analytical models\\[4pt]