import marimo __generated_with = "0.19.6" app = marimo.App(width="full") # ───────────────────────────────────────────────────────────────────────────── # LAB 07: THE KERNEL FUSION DIVIDEND # # Chapter: ML Frameworks (@sec-ml-frameworks) # Core Invariant: Every unfused kernel launch pays a dispatch tax (5–20 μs). # Kernel fusion reduces memory round-trips by keeping data on-chip. A fused # LayerNorm + Dropout + ReLU sequence can yield 5× wall-clock speedup vs # 3 separate kernels. torch.compile provides 1.3–2× throughput gain by # reducing kernel launch overhead. # # Act 1 (12 min): The Dispatch Tax Audit # Wrong prior: students believe larger models benefit most from compilation. # Reality: KWS (1,000 small kernels) has 33% GPU utilization at 5 μs compute # + 10 μs dispatch per kernel. Compilation raises utilization to ~67%. # GPT-2 (20 large kernels) already has 90% utilization — dispatch is negligible. # # Act 2 (22 min): The Compilation Break-Even # Wrong prior: "compile once, run fast forever" is always net positive. # Reality: 30-second compile time on ResNet-50 requires ~134,000 images to # break even. KWS on a Cloud server recovers quickly; on edge, the deployment # may expire before break-even is reached. # # 2 Contexts: Cloud (A100, 312 TFLOPS FP16, 2.0 TB/s HBM2e, 10M req/day) # Edge (Jetson Orin NX, 100 TOPS INT8, 102 GB/s, 100 req/hr) # # Design Ledger: saves execution_mode, fusion_enabled, compilation_roi_positive, # breakeven_inferences, kws_utilization_eager_pct, kws_utilization_compiled_pct # Downstream: Lab 08 (Training MFU), Lab 11 (Roofline arithmetic intensity) # ───────────────────────────────────────────────────────────────────────────── # ─── CELL 0: SETUP (hide_code=False — leave visible) ───────────────────────── @app.cell def _(): import marimo as mo import sys from pathlib import Path import plotly.graph_objects as go import numpy as np import math _root = Path(__file__).resolve().parents[2] if str(_root) not in sys.path: sys.path.insert(0, str(_root)) from labs.core.state import DesignLedger from labs.core.style import COLORS, LAB_CSS, apply_plotly_theme ledger = DesignLedger() # ── HARDWARE CONSTANTS (traceability: frameworks.qmd) ─────────────────── # A100 FP16 dense TFLOPS — frameworks.qmd line ~101 (A100BLAS.dense_tflops_str = 312) A100_TFLOPS_FP16 = 312 # TFLOPS dense FP16 # A100 HBM2e bandwidth — frameworks.qmd line ~293 (MemoryWallSpecs: "2.0 TB/s") A100_BW_TBS = 2.0 # TB/s = 2,000 GB/s A100_BW_GBS = 2000 # GB/s # Roofline ridge point: 312 TFLOPS / 2.0 TB/s = 156 FLOP/byte A100_RIDGE_FLOP_PER_BYTE = 156 # frameworks.qmd implied (§ Memory Wall) # A100 RAM A100_RAM_GB = 80 # GB HBM2e # Jetson Orin NX specs (NVIDIA datasheet, representative edge device) ORIN_TOPS_INT8 = 100 # TOPS INT8 ORIN_BW_GBS = 102 # GB/s ORIN_RAM_GB = 16 # GB ORIN_TDP_W = 25 # Watts # Dispatch tax constants — frameworks.qmd line ~307 # "Each kernel launch incurs 5–20 μs of CPU-side overhead" DISPATCH_US_CLOUD = 10 # μs per kernel launch on cloud (mid of 5–20 range) DISPATCH_US_EDGE = 50 # μs per kernel on edge (higher OS overhead) # torch.compile ResNet-50 data — frameworks.qmd line ~1283 # "torch.compile provides ~48% speedup on ResNet-50 (2,150 vs 1,450 img/sec)" RESNET50_THROUGHPUT_EAGER = 1450 # img/sec RESNET50_THROUGHPUT_COMPILED = 2150 # img/sec RESNET50_COMPILE_TIME_S = 30 # seconds # Break-even formula (frameworks.qmd, derivable from the above) # N_breakeven = t_compile / (1/R_eager - 1/R_compiled) _delta_t_per_img = (1 / RESNET50_THROUGHPUT_EAGER) - (1 / 
RESNET50_THROUGHPUT_COMPILED) RESNET50_BREAKEVEN_IMAGES = int(RESNET50_COMPILE_TIME_S / _delta_t_per_img) # ~134,000 # GPU utilization range from framework choice — frameworks.qmd line ~2663 # "whether a training loop achieves 30% or 80% of theoretical hardware throughput" UTILIZATION_LOW_PCT = 30 # % (eager, small-kernel model) UTILIZATION_HIGH_PCT = 80 # % (compiled, optimized) return ( mo, go, np, math, ledger, COLORS, LAB_CSS, apply_plotly_theme, A100_TFLOPS_FP16, A100_BW_TBS, A100_BW_GBS, A100_RIDGE_FLOP_PER_BYTE, A100_RAM_GB, ORIN_TOPS_INT8, ORIN_BW_GBS, ORIN_RAM_GB, ORIN_TDP_W, DISPATCH_US_CLOUD, DISPATCH_US_EDGE, RESNET50_THROUGHPUT_EAGER, RESNET50_THROUGHPUT_COMPILED, RESNET50_COMPILE_TIME_S, RESNET50_BREAKEVEN_IMAGES, UTILIZATION_LOW_PCT, UTILIZATION_HIGH_PCT, ) # ─── CELL 1: HEADER ─────────────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, LAB_CSS, COLORS): mo.vstack([ LAB_CSS, mo.Html(f"""
Machine Learning Systems · Volume I · Lab 07

The Kernel Fusion Dividend

Every unfused kernel launch pays a dispatch tax. At 10 μs of dispatch per launch, 5 μs of compute per kernel, and 1,000 kernels per forward pass, the GPU is busy for only about 33% of wall time. Kernel fusion and compilation recover the rest — but the payoff depends entirely on your kernel count and deployment volume.

2 Acts · 35–40 min Chapter 7: ML Frameworks Read @sec-ml-frameworks first
Cloud (A100)
312 TFLOPS · 2.0 TB/s
Edge (Jetson Orin NX)
100 TOPS · 102 GB/s
Invariant
Dispatch Tax: 5–20 μs/kernel
"""), ]) return # ─── CELL 2: RECOMMENDED READING ────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): mo.callout(mo.md(""" **Recommended Reading** — Complete the following sections of @sec-ml-frameworks before this lab: - **The Memory Wall** (@sec-ml-frameworks-execution-strategy-matters-memory-wall-1ce8) — The gap between 312 TFLOPS of compute and 2.0 TB/s of bandwidth is why element-wise ops like ReLU achieve less than 1% of peak compute. - **Kernel Fusion and the Dispatch Tax** — Framework compilation fuses adjacent operations, reducing kernel launches from N to 1 and eliminating intermediate HBM reads/writes. - **Compilation Continuum** — The spectrum from eager execution (debug-friendly, optimization-limited) to fully compiled graphs (latency to compile, 1.3–2× throughput gain). """), kind="info") return # ─── CELL 3: CONTEXT TOGGLE ──────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, COLORS): context_toggle = mo.ui.radio( options={ "Cloud (A100 — 10M requests/day)": "cloud", "Edge (Jetson Orin NX — 100 requests/hour)": "edge", }, value="Cloud (A100 — 10M requests/day)", label="Deployment context:", inline=True, ) mo.vstack([ mo.md("---"), mo.Html(f"""
Deployment Context — Applied to Both Acts
The dispatch tax per kernel is 10 μs on Cloud (A100) and 50 μs on Edge (Jetson), where OS and driver overhead per launch is higher relative to compute time. Switch contexts to see how the same model behaves under different overhead regimes.
"""), context_toggle, ]) return (context_toggle,) # ───────────────────────────────────────────────────────────────────────────── # ACT 1: THE DISPATCH TAX AUDIT # ───────────────────────────────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, COLORS, context_toggle): _ctx = context_toggle.value _dispatch_us = 10 if _ctx == "cloud" else 50 _ctx_label = "Cloud (A100)" if _ctx == "cloud" else "Edge (Jetson Orin NX)" _ctx_color = COLORS["Cloud"] if _ctx == "cloud" else COLORS["Edge"] mo.vstack([ mo.md("---"), mo.Html(f"""
Act 1 · 12 min
Context: {_ctx_label} · {_dispatch_us} μs dispatch tax per kernel
"""), mo.md("## The Dispatch Tax Audit"), mo.Html(f"""
Incoming Message · ML Infra Lead
"Our Keyword Spotting model's transformer inference is 3× slower than the paper reports. Same hardware, same model. The profiler shows 1,000 kernel launches per forward pass. Each kernel averages 5 μs of GPU compute. What fraction of wall time is the GPU actually doing math?"
"""), ]) return # ─── ACT 1: PREDICTION LOCK ─────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): act1_pred = mo.ui.radio( options={ "A) About 90% — the GPU is the bottleneck, not the CPU": "A", "B) About 50% — half compute, half overhead": "B", "C) About 33% — compute (5 μs) is one-third of total time (5 + 10 μs)": "C", "D) About 5% — overhead completely dominates": "D", }, label="""**Commit your prediction before unlocking the instruments.** A Keyword Spotting model performs 1,000 kernel launches per forward pass. Each kernel computes for 5 μs on average. Each kernel *launch* costs 10 μs of CPU-side dispatch overhead. What fraction of wall-clock time is the GPU actually performing tensor operations?""", ) mo.vstack([ act1_pred, mo.callout(mo.md("Select your prediction to unlock the Act 1 instruments."), kind="warn") if act1_pred.value is None else mo.md(""), ]) return (act1_pred,) # ─── ACT 1: GATE ────────────────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, act1_pred): mo.stop( act1_pred.value is None, mo.callout(mo.md("Select your Act 1 prediction above to unlock the Dispatch Tax Waterfall."), kind="warn"), ) return # ─── ACT 1: INSTRUMENTS ─────────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): act1_model_selector = mo.ui.dropdown( options={ "KWS (Keyword Spotting) — 1,000 small kernels, 5 μs each": "kws", "ResNet-50 — 200 medium kernels, 50 μs each": "resnet50", "GPT-2 Layer — 20 large kernels, 500 μs each": "gpt2", }, value="KWS (Keyword Spotting) — 1,000 small kernels, 5 μs each", label="Model type", ) act1_mode_toggle = mo.ui.radio( options={ "Eager (unfused)": "eager", "Compiled (fused)": "compiled", }, value="Eager (unfused)", label="Execution mode", inline=True, ) mo.hstack([ mo.vstack([mo.md("**Model type**"), act1_model_selector]), mo.vstack([mo.md("**Execution mode**"), act1_mode_toggle]), ], justify="start", gap=3) return (act1_model_selector, act1_mode_toggle) # ─── ACT 1: PHYSICS ENGINE + VISUALIZATION ──────────────────────────────────── @app.cell(hide_code=True) def _( mo, go, np, act1_model_selector, act1_mode_toggle, context_toggle, COLORS, apply_plotly_theme, DISPATCH_US_CLOUD, DISPATCH_US_EDGE, ): # ── Physics constants (traceability: frameworks.qmd) ───────────────────── # Dispatch tax per kernel — line ~307: "5–20 μs of CPU-side overhead" _ctx = context_toggle.value _dispatch_us = DISPATCH_US_CLOUD if _ctx == "cloud" else DISPATCH_US_EDGE # Model parameters: (kernel_count, compute_us_per_kernel, compiled_kernel_reduction_pct) # KWS: 1,000 small kernels — frameworks.qmd plan §3 # ResNet-50: 200 medium kernels — realistic estimate for 50-layer residual network # GPT-2 Layer: 20 large kernels (matmul-dominant) — minimal dispatch _model_params = { "kws": {"n": 1000, "compute_us": 5, "compiled_reduction": 0.50}, "resnet50": {"n": 200, "compute_us": 50, "compiled_reduction": 0.35}, "gpt2": {"n": 20, "compute_us": 500, "compiled_reduction": 0.15}, } _mode = act1_mode_toggle.value _model_key = act1_model_selector.value.split(" — ")[0].lower().replace("-", "").replace(" ", "") # Map display names to keys _key_map = { "kws": "kws", "keyword": "kws", "resnet50": "resnet50", "resnet": "resnet50", "gpt2": "gpt2", "gpt": "gpt2", } _mk = "kws" for _k in _key_map: if _k in _model_key: _mk = _key_map[_k] break _p = _model_params[_mk] # In compiled mode: kernel count reduces, dispatch overhead drops proportionally # Compute time is unchanged (same work, less fragmentation 
    # overhead)
    if _mode == "compiled":
        _n_kernels = int(_p["n"] * (1 - _p["compiled_reduction"]))
    else:
        _n_kernels = _p["n"]

    # Total times (μs)
    _total_compute_us = _p["n"] * _p["compute_us"]  # same total computation always
    _total_dispatch_us = _n_kernels * _dispatch_us

    # Memory transfer: estimated from arithmetic intensity (AI)
    #   Element-wise ops (KWS): low AI (~0.1 FLOP/byte)
    #   ResNet conv: medium AI (~8 FLOP/byte)
    #   GPT-2 matmul: high AI (~80 FLOP/byte)
    _ai_map = {"kws": 0.1, "resnet50": 8.0, "gpt2": 80.0}
    _ai = _ai_map[_mk]

    # Simplified memory-time heuristic, using A100 BW (2,000 GB/s) as baseline
    # and scaling for edge. The 100× calibration factor is an assumption (not
    # chapter data), chosen so the KWS eager case lands at the ~500 μs of HBM
    # time used in the prediction reveal; floor of 50 μs.
    _bw_gbs = 2000 if _ctx == "cloud" else 102
    _total_memory_us = max(50, int((_total_compute_us / (_ai * 100.0)) * (2000 / _bw_gbs)))

    _total_us = _total_compute_us + _total_dispatch_us + _total_memory_us
    _gpu_utilization_pct = 100.0 * _total_compute_us / _total_us

    # ── Build stacked bar chart ───────────────────────────────────────────────
    _bar_color_compute = COLORS["BlueLine"]
    _bar_color_dispatch = COLORS["OrangeLine"]
    _bar_color_memory = COLORS["GreenLine"]
    _fig = go.Figure()
    _fig.add_trace(go.Bar(
        name="Kernel Compute",
        x=["Forward Pass Breakdown"],
        y=[_total_compute_us / 1000],  # convert to ms
        marker_color=_bar_color_compute,
        text=[f"Compute: {_total_compute_us/1000:.2f} ms"],
        textposition="inside",
        textfont=dict(color="white", size=11, family="SF Mono, monospace"),
    ))
    _fig.add_trace(go.Bar(
        name="Dispatch Overhead",
        x=["Forward Pass Breakdown"],
        y=[_total_dispatch_us / 1000],
        marker_color=_bar_color_dispatch,
        text=[f"Dispatch: {_total_dispatch_us/1000:.2f} ms ({_n_kernels} kernels × {_dispatch_us}μs)"],
        textposition="inside",
        textfont=dict(color="white", size=11, family="SF Mono, monospace"),
    ))
    _fig.add_trace(go.Bar(
        name="Memory Transfer",
        x=["Forward Pass Breakdown"],
        y=[_total_memory_us / 1000],
        marker_color=_bar_color_memory,
        text=[f"Memory: {_total_memory_us/1000:.2f} ms"],
        textposition="inside",
        textfont=dict(color="white", size=11, family="SF Mono, monospace"),
    ))
    _fig.update_layout(
        barmode="stack",
        height=320,
        yaxis=dict(title="Time (ms)", gridcolor="#f1f5f9", linecolor="#e2e8f0"),
        xaxis=dict(gridcolor="#f1f5f9", linecolor="#e2e8f0"),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=30),
        plot_bgcolor="white",
        paper_bgcolor="white",
        font=dict(family="Inter, sans-serif", color="#0f172a"),
    )

    # ── Model comparison table ────────────────────────────────────────────────
    # Carry the _ai_map key alongside each display name; the previous
    # prefix-based lookup (_model_name[:3]) raised KeyError for "res"/"gpt".
    _rows = ""
    for _model_name, _mkey, _mp in [
        ("KWS (1,000 kernels, 5 μs each)", "kws", _model_params["kws"]),
        ("ResNet-50 (200 kernels, 50 μs each)", "resnet50", _model_params["resnet50"]),
        ("GPT-2 Layer (20 kernels, 500 μs each)", "gpt2", _model_params["gpt2"]),
    ]:
        _tc = _mp["n"] * _mp["compute_us"]
        _tm = max(50, int((_tc / (_ai_map[_mkey] * 100.0)) * (2000 / _bw_gbs)))
        _td_eager = _mp["n"] * _dispatch_us
        _util_eager = 100.0 * _tc / (_tc + _td_eager + _tm)
        _n_compiled = int(_mp["n"] * (1 - _mp["compiled_reduction"]))
        _td_compiled = _n_compiled * _dispatch_us
        _util_compiled = 100.0 * _tc / (_tc + _td_compiled + _tm)
        _eager_color = "#008F45" if _util_eager >= 70 else ("#CC5500" if _util_eager >= 40 else "#CB202D")
_comp_color = "#008F45" if _util_compiled >= 70 else ("#CC5500" if _util_compiled >= 40 else "#CB202D") _rows += f""" {_model_name} {_mp['n']} {_mp['compute_us']} μs {_dispatch_us} μs {_util_eager:.0f}% {_util_compiled:.0f}% """ _table_html = f"""
{_rows}
Model Kernels Compute/Kernel Dispatch/Kernel Eager Util % Compiled Util %
""" # ── Physics formula display ─────────────────────────────────────────────── _formula_block = f"""
Dispatch Tax Physics
GPU Utilization = N_kernels × t_compute / (N_kernels × (t_compute + t_dispatch) + t_memory)

{act1_model_selector.value.split(' — ')[0]} ({_mode}): {_total_compute_us:,} μs compute / {_total_us:,} μs total = {_gpu_utilization_pct:.1f}%
t_dispatch = {_n_kernels} kernels × {_dispatch_us} μs = {_total_dispatch_us:,} μs
t_compute = {_total_compute_us:,} μs (unchanged by compilation)
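t_memory = {_total_memory_us:,} μs (HBM transfers; reduced only when fusion removes intermediate tensors)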
""" return mo.vstack([ mo.md(f"""### Dispatch Tax Waterfall — {act1_model_selector.value.split(' — ')[0]} · {act1_mode_toggle.value}"""), mo.ui.plotly(_fig), mo.Html(_formula_block), mo.md("**Model Comparison Table** — GPU utilization across model types and execution modes"), mo.Html(_table_html), ]) # ─── ACT 1: PREDICTION REVEAL ───────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, act1_pred, COLORS): # Correct answer is C: 33% # KWS: 1,000 × 5 μs compute = 5,000 μs total compute # 1,000 × 10 μs dispatch = 10,000 μs total dispatch # Compute fraction = 5,000 / (5,000 + 10,000 + ~500 memory) ≈ 33% _correct_pct = 33.3 _predicted_map = {"A": 90, "B": 50, "C": 33, "D": 5} _predicted_pct = _predicted_map.get(act1_pred.value, 0) _is_correct = act1_pred.value == "C" _gap_pct = abs(_correct_pct - _predicted_pct) if _is_correct: _reveal_kind = "success" _reveal_text = f"""**Your prediction was correct: ~33%.** The arithmetic: 1,000 kernels × 5 μs compute = 5,000 μs of actual GPU work; 1,000 kernels × 10 μs dispatch = 10,000 μs of overhead. GPU utilization = 5,000 / (5,000 + 10,000 + memory) ≈ **33%**. The GPU spends two-thirds of wall time waiting for kernel launches, not computing.""" else: _reveal_text = f"""**You predicted {_predicted_pct}%. The actual value is ~33%. You were off by {_gap_pct:.0f} percentage points.** The arithmetic: 1,000 kernels × 5 μs compute = 5,000 μs of GPU work; 1,000 kernels × 10 μs dispatch = 10,000 μs of overhead. GPU utilization = 5,000 / (5,000 + 10,000 + memory) ≈ **33%**. {'You overestimated — dispatch overhead dominates when kernels are small.' if _predicted_pct > _correct_pct else 'Overhead is real but compute is not zero — the ratio is 33%, not near zero.'}""" _reveal_kind = "warn" return mo.callout(mo.md(_reveal_text + """ **The key asymmetry:** A 2× faster GPU would not fix 33% utilization. It would complete the 5 μs compute in 2.5 μs — and then wait 10 μs for the next launch. Faster hardware *amplifies* the dispatch tax."""), kind=_reveal_kind) # ─── ACT 1: REFLECTION ──────────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): act1_reflection = mo.ui.radio( options={ "A) Faster GPUs have higher power draw, which causes thermal throttling": "A", "B) Faster GPUs require more time to warm up before peak throughput": "B", "C) Faster compute reduces the compute fraction of each kernel, making the fixed dispatch overhead a larger share of total time": "C", "D) Faster GPUs use different memory hierarchies that are incompatible with small models": "D", }, label="""**Structured Reflection.** A faster GPU sometimes produces *lower* utilization for small models because:""", ) act1_reflection return (act1_reflection,) @app.cell(hide_code=True) def _(mo, act1_reflection): mo.stop( act1_reflection.value is None, mo.callout(mo.md("Select your answer to see the explanation."), kind="warn"), ) _correct = act1_reflection.value == "C" _feedback_map = { "A": "**Not quite.** Thermal throttling affects sustained throughput, but it is not the mechanism here. A faster GPU running the same dispatch-limited workload would finish the 5 μs compute faster — say in 2.5 μs — and then idle for 10 μs waiting for the next launch. Power draw is not the variable.", "B": "**Not quite.** Warm-up time affects the first few iterations (the 'cold start' problem), not steady-state utilization. 
Once the GPU is at operating temperature, the dispatch tax per kernel is independent of how long the GPU has been running.",
        "C": "**Correct.** This is the fundamental asymmetry: the dispatch tax (5–20 μs) is a CPU-side constant set by driver overhead. If the GPU computes faster, it finishes sooner but the dispatch timer does not compress. The ratio t_compute / (t_compute + t_dispatch) falls as t_compute shrinks — faster silicon, lower utilization for the same kernel structure.",
        "D": "**Not quite.** Memory hierarchy compatibility is not the mechanism. The same kernel code runs on all NVIDIA architectures. The dispatch overhead difference is in the OS scheduler and CUDA driver layer, not the memory subsystem.",
    }
    return mo.callout(mo.md(_feedback_map[act1_reflection.value]), kind="success" if _correct else "warn")


# ─── ACT 1: MATHPEEK ──────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.accordion({
        "The governing equation": mo.md(r"""
**GPU Utilization Formula:**

$$\text{GPU Utilization} = \frac{N \cdot t_{compute}}{N \cdot (t_{compute} + t_{launch}) + t_{memory}}$$

- **N** — number of kernel launches per forward pass
- **t\_compute** — GPU compute time per kernel (μs) — set by the operation size
- **t\_launch** — CPU-side dispatch overhead per kernel (5–20 μs on Cloud; 50 μs on Edge) — *constant*
- **t\_memory** — HBM transfer time for the full forward pass (μs)

**Numerical check (KWS eager, Cloud):**

$$\text{Utilization} = \frac{1000 \times 5}{1000 \times (5 + 10) + 500} = \frac{5000}{15500} \approx 33\%$$

**What compilation changes:** Compiled mode reduces N by 30–80% via operator fusion. Compute time is unchanged (same total FLOP count), so the full compute term stays in the denominator; only the launch term shrinks with the reduced kernel count:

$$\text{Utilization}_{compiled} = \frac{N \cdot t_{compute}}{N \cdot t_{compute} + (N \cdot r_{fusion}) \cdot t_{launch} + t_{memory}}$$

where $r_{fusion} \in [0.2, 0.7]$ is the fraction of kernels remaining after fusion.
""")
    })
    return


# ─────────────────────────────────────────────────────────────────────────────
# ACT 2: THE COMPILATION BREAK-EVEN
# ─────────────────────────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, COLORS, context_toggle):
    _ctx = context_toggle.value
    _ctx_label = "Cloud (A100)" if _ctx == "cloud" else "Edge (Jetson Orin NX)"
    _ctx_color = COLORS["Cloud"] if _ctx == "cloud" else COLORS["Edge"]
    _volume_default = "10M requests/day" if _ctx == "cloud" else "100 requests/hour"
    mo.vstack([
        mo.md("---"),
        mo.Html(f"""
Act 2 · 22 min
Context: {_ctx_label} · Default volume: {_volume_default}
"""), mo.md("## The Compilation Break-Even"), mo.callout(mo.md("""**Design Challenge:** You have a production serving system. You must decide: run eagerly (flexible, no compile cost) or compile (latency to compile, permanent throughput gain). The break-even analysis determines whether compilation is net-positive for your deployment. A wrong decision costs either throughput or deployment time."""), kind="info"), ]) return # ─── ACT 2: PREDICTION LOCK ─────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, act1_reflection): mo.stop( act1_reflection.value is None, mo.callout(mo.md("Complete the Act 1 reflection above to unlock Act 2."), kind="warn"), ) act2_pred = mo.ui.radio( options={ "A) About 1,000 images — the overhead is tiny": "A", "B) About 10,000 images — roughly 10 seconds of inference at baseline throughput": "B", "C) About 130,000 images — the time saved per image is small, so many images are needed": "C", "D) About 10 million images — compilation is almost never worth it": "D", }, label="""**Commit your prediction before unlocking the Act 2 instruments.** torch.compile on ResNet-50 improves throughput by 48% (from 1,450 to 2,150 images/sec) but requires 30 seconds of one-time compilation. Approximately how many images must you process before the compilation time cost is recovered?""", ) mo.vstack([ act2_pred, mo.callout(mo.md("Select your prediction to unlock the compilation instruments."), kind="warn") if act2_pred.value is None else mo.md(""), ]) return (act2_pred,) # ─── ACT 2: GATE ────────────────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, act2_pred): mo.stop( act2_pred.value is None, mo.callout(mo.md("Select your Act 2 prediction above to unlock the compilation instruments."), kind="warn"), ) return # ─── ACT 2: PANEL A — KERNEL FUSION EXPLORER ────────────────────────────────── @app.cell(hide_code=True) def _(mo): mo.md("""### Panel A: Kernel Fusion Explorer **Operation sequence:** LayerNorm → Dropout → ReLU Without fusion, these three element-wise operations each read their input from HBM, compute, and write their output back to HBM. With fusion, the input is read once, all three operations execute in registers/L1, and only the final output is written. 
""") return @app.cell(hide_code=True) def _(mo): act2_fusion_toggle = mo.ui.radio( options={ "Unfused (3 separate kernel launches)": "unfused", "Fused (1 kernel launch)": "fused", }, value="Unfused (3 separate kernel launches)", label="Fusion state", inline=True, ) act2_hbm_bw_selector = mo.ui.dropdown( options={ "1.0 TB/s (older A100 equivalent)": 1000, "2.0 TB/s (A100 HBM2e)": 2000, "3.35 TB/s (H100 HBM3)": 3350, }, value="2.0 TB/s (A100 HBM2e)", label="HBM bandwidth", ) mo.hstack([ mo.vstack([mo.md("**Fusion state**"), act2_fusion_toggle]), mo.vstack([mo.md("**HBM bandwidth**"), act2_hbm_bw_selector]), ], justify="start", gap=3) return (act2_fusion_toggle, act2_hbm_bw_selector) @app.cell(hide_code=True) def _( mo, go, act2_fusion_toggle, act2_hbm_bw_selector, context_toggle, COLORS, apply_plotly_theme, DISPATCH_US_CLOUD, DISPATCH_US_EDGE, ): _ctx = context_toggle.value _dispatch_us = DISPATCH_US_CLOUD if _ctx == "cloud" else DISPATCH_US_EDGE _bw_gbs = act2_hbm_bw_selector.value _is_fused = act2_fusion_toggle.value == "fused" # ── Fusion physics (traceability: frameworks.qmd §4, lab plan §4 Panel A) ─ # LayerNorm + Dropout + ReLU fusion: 3 element-wise ops # Unfused: 3 reads + 3 writes = 6 HBM ops per tensor element # Fused: 1 read + 1 write = 2 HBM ops per tensor element # HBM traffic ratio: 6/2 = 3× reduction — frameworks.qmd plan (derived, not FlashAttention) # FlashAttention provides 10–20× which is tiling of attention matrix — different mechanism # Representative tensor size: 1024 tokens × 768 hidden = 786,432 elements × 2 bytes (FP16) _tensor_bytes = 1024 * 768 * 2 # bytes per tensor transfer _n_ops_unfused = 3 # LayerNorm, Dropout, ReLU # HBM traffic _hbm_ops_unfused = 2 * _n_ops_unfused # 3 reads + 3 writes _hbm_ops_fused = 2 # 1 read + 1 write _hbm_traffic_unfused_mb = (_hbm_ops_unfused * _tensor_bytes) / 1e6 _hbm_traffic_fused_mb = (_hbm_ops_fused * _tensor_bytes) / 1e6 # Transfer time in μs _bw_bytes_per_us = _bw_gbs * 1000 # GB/s → MB/μs → bytes/μs = _bw_gbs * 1e9 / 1e6 _t_mem_unfused_us = (_hbm_traffic_unfused_mb * 1e6) / (_bw_gbs * 1e9 / 1e6) _t_mem_fused_us = (_hbm_traffic_fused_mb * 1e6) / (_bw_gbs * 1e9 / 1e6) # Kernel launch overhead _t_dispatch_unfused_us = _n_ops_unfused * _dispatch_us _t_dispatch_fused_us = 1 * _dispatch_us # Compute time (same total work, 3 element-wise ops — fixed) _t_compute_us = 5 # μs for all 3 ops combined (small element-wise) # Total times _total_unfused_us = _t_compute_us + _t_dispatch_unfused_us + _t_mem_unfused_us _total_fused_us = _t_compute_us + _t_dispatch_fused_us + _t_mem_fused_us _speedup = _total_unfused_us / _total_fused_us if _total_fused_us > 0 else 1.0 _traffic_reduction = _hbm_ops_unfused / _hbm_ops_fused # = 3× # Arithmetic intensity _compute_flops = 1024 * 768 * 10 # ~10 FLOP per element for all 3 ops combined _ai_unfused = _compute_flops / (_hbm_traffic_unfused_mb * 1e6) _ai_fused = _compute_flops / (_hbm_traffic_fused_mb * 1e6) _active_state = "fused" if _is_fused else "unfused" _active_total = _total_fused_us if _is_fused else _total_unfused_us _active_mem = _t_mem_fused_us if _is_fused else _t_mem_unfused_us _active_dispatch = _t_dispatch_fused_us if _is_fused else _t_dispatch_unfused_us _active_traffic = _hbm_traffic_fused_mb if _is_fused else _hbm_traffic_unfused_mb _active_ai = _ai_fused if _is_fused else _ai_unfused # ── Bar chart: memory traffic comparison ──────────────────────────────── _fig_fusion = go.Figure() _active_color_un = COLORS["OrangeLine"] if not _is_fused else COLORS["Grey"] _active_color_f = 
COLORS["GreenLine"] if _is_fused else COLORS["Grey"] _fig_fusion.add_trace(go.Bar( name="Unfused (3 kernels)", x=["HBM Reads", "HBM Writes", "Dispatch Overhead"], y=[ _hbm_traffic_unfused_mb / 2, # reads only _hbm_traffic_unfused_mb / 2, # writes only _t_dispatch_unfused_us / 100, # scale for visibility ], marker_color=COLORS["OrangeLine"], opacity=0.5 if _is_fused else 1.0, )) _fig_fusion.add_trace(go.Bar( name="Fused (1 kernel)", x=["HBM Reads", "HBM Writes", "Dispatch Overhead"], y=[ _hbm_traffic_fused_mb / 2, _hbm_traffic_fused_mb / 2, _t_dispatch_fused_us / 100, ], marker_color=COLORS["GreenLine"], opacity=0.5 if not _is_fused else 1.0, )) _fig_fusion.update_layout( barmode="group", height=280, yaxis=dict(title="MB (reads/writes) / scaled overhead", gridcolor="#f1f5f9"), xaxis=dict(gridcolor="#f1f5f9"), legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), margin=dict(l=50, r=20, t=40, b=30), plot_bgcolor="white", paper_bgcolor="white", font=dict(family="Inter, sans-serif", color="#0f172a"), ) # ── Results panel ──────────────────────────────────────────────────────── _result_color = COLORS["GreenLine"] if _speedup >= 3.0 else (COLORS["OrangeLine"] if _speedup >= 1.5 else COLORS["RedLine"]) _ai_color = COLORS["OrangeLine"] # still memory-bound below ridge point (156 FLOP/byte) _panel_a_results = f"""
HBM Traffic
{_active_traffic:.2f} MB
{"1 read + 1 write" if _is_fused else "3 reads + 3 writes"}
Traffic Reduction
{_traffic_reduction:.0f}×
vs unfused baseline
Wall-Clock Speedup
{_speedup:.1f}×
fused vs unfused
Arithmetic Intensity
{_active_ai:.2f} FLOP/B
{"Still memory-bound (ridge = 156 FLOP/B)" if _active_ai < 156 else "Compute-bound"}
""" _fusion_formula = f"""
Fusion Physics (LayerNorm + Dropout + ReLU)
Unfused: {_n_ops_unfused} reads + {_n_ops_unfused} writes = {_hbm_ops_unfused} HBM ops → {_hbm_traffic_unfused_mb:.2f} MB
Fused: 1 read + 1 write = {_hbm_ops_fused} HBM ops → {_hbm_traffic_fused_mb:.2f} MB
Traffic reduction: {_hbm_ops_unfused}/{_hbm_ops_fused} = {_traffic_reduction:.0f}× less HBM traffic
Wall-clock speedup: {_total_unfused_us:.1f} μs → {_total_fused_us:.1f} μs = {_speedup:.1f}× faster
Arithmetic intensity: {_ai_unfused:.3f} → {_ai_fused:.3f} FLOP/byte (still below ridge point of 156 FLOP/B)
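General rule: fusing N element-wise ops cuts HBM ops from 2N to 2, an N-fold traffic reduction (here N = {_n_ops_unfused})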
""" return mo.vstack([ mo.ui.plotly(_fig_fusion), mo.Html(_panel_a_results), mo.Html(_fusion_formula), mo.callout(mo.md(f"""**Key distinction:** The 3× HBM traffic reduction applies to *element-wise fusion* (LayerNorm + Dropout + ReLU — 3 ops → 1 op). The 10–20× figure cited in @sec-ml-frameworks applies to **FlashAttention** specifically, which tiles the attention matrix to avoid materializing the full N×N attention scores in HBM. These are different mechanisms. Element-wise fusion is proportional to the number of ops fused; FlashAttention's gain comes from tiling mathematics."""), kind="info"), ]) # ─── ACT 2: PANEL B — COMPILATION ROI CALCULATOR ────────────────────────────── @app.cell(hide_code=True) def _(mo): mo.md("""### Panel B: Compilation ROI Calculator Compilation has a one-time cost and a permanent throughput gain. Whether it is net-positive depends on how many inferences you run before the deployment ends. """) return @app.cell(hide_code=True) def _(mo, context_toggle): _ctx = context_toggle.value # Default values reflect the two deployment contexts # Cloud: high volume, compilation always pays off quickly # Edge: low volume (100 req/hr), short deployments _default_compile_s = 30 # ResNet-50 default _default_gain_pct = 48 # ResNet-50 default from chapter _default_volume = 10000000 if _ctx == "cloud" else 100 _default_duration_days = 30 if _ctx == "cloud" else 1 act2_compile_time_slider = mo.ui.slider( start=5, stop=300, step=5, value=_default_compile_s, label="Compilation time (seconds)", full_width=True, ) act2_gain_pct_slider = mo.ui.slider( start=5, stop=100, step=5, value=_default_gain_pct, label="Throughput gain from compilation (%)", full_width=True, ) act2_volume_slider = mo.ui.slider( start=100, stop=10000000, step=100, value=_default_volume, label="Inferences per day", full_width=True, ) act2_duration_slider = mo.ui.slider( start=1, stop=365, step=1, value=_default_duration_days, label="Deployment duration (days)", full_width=True, ) mo.vstack([ mo.hstack([act2_compile_time_slider, act2_gain_pct_slider], justify="start", gap=2), mo.hstack([act2_volume_slider, act2_duration_slider], justify="start", gap=2), ]) return ( act2_compile_time_slider, act2_gain_pct_slider, act2_volume_slider, act2_duration_slider, ) # ─── ACT 2: BREAK-EVEN CHART + FAILURE STATE ───────────────────────────────── @app.cell(hide_code=True) def _( mo, go, np, act2_compile_time_slider, act2_gain_pct_slider, act2_volume_slider, act2_duration_slider, context_toggle, COLORS, apply_plotly_theme, RESNET50_THROUGHPUT_EAGER, RESNET50_THROUGHPUT_COMPILED, ): _ctx = context_toggle.value _compile_s = act2_compile_time_slider.value _gain_pct = act2_gain_pct_slider.value / 100.0 _volume_per_day = act2_volume_slider.value _duration_days = act2_duration_slider.value # ── Break-even calculation ──────────────────────────────────────────────── # frameworks.qmd line ~1283: ResNet-50 baseline 1,450 img/sec; compiled 2,150 img/sec # User-configurable gain applied to baseline _throughput_eager = RESNET50_THROUGHPUT_EAGER _throughput_compiled = _throughput_eager * (1 + _gain_pct) # Time saved per image (seconds) _t_per_img_eager = 1.0 / _throughput_eager _t_per_img_compiled = 1.0 / _throughput_compiled _delta_t_per_img = _t_per_img_eager - _t_per_img_compiled # seconds saved per image # Break-even: how many images until compile_time is recovered # N_breakeven = t_compile / delta_t_per_image _breakeven_images = int(_compile_s / _delta_t_per_img) if _delta_t_per_img > 0 else float('inf') _breakeven_days = 
_breakeven_images / _volume_per_day if _volume_per_day > 0 else float('inf') _total_images = _volume_per_day * _duration_days _roi_positive = _total_images >= _breakeven_images # ── Build break-even timeline ──────────────────────────────────────────── # X-axis: days into deployment # Y-axis: cumulative time saved (seconds) minus compile overhead _days = np.linspace(0, _duration_days, 500) _images_elapsed = _days * _volume_per_day _cumulative_saved_s = _images_elapsed * _delta_t_per_img - _compile_s _compile_overhead_line = np.zeros_like(_days) _fig_roi = go.Figure() # Compilation overhead baseline _fig_roi.add_trace(go.Scatter( x=_days, y=_compile_overhead_line, name="Break-even line", line=dict(color="#94a3b8", width=1.5, dash="dot"), showlegend=True, )) # Cumulative time saved _line_color = COLORS["GreenLine"] if _roi_positive else COLORS["OrangeLine"] _fig_roi.add_trace(go.Scatter( x=_days, y=_cumulative_saved_s, name="Cumulative time saved", fill="tozeroy", fillcolor=f"rgba(0, 143, 69, 0.12)" if _roi_positive else f"rgba(204, 85, 0, 0.12)", line=dict(color=_line_color, width=2.5), )) # Mark break-even point if within deployment if _breakeven_days <= _duration_days and _breakeven_days > 0: _fig_roi.add_vline( x=_breakeven_days, line_color=COLORS["BlueLine"], line_dash="dash", line_width=2, annotation_text=f"Break-even: day {_breakeven_days:.1f}", annotation_position="top right", annotation_font_color=COLORS["BlueLine"], annotation_font_size=11, ) _fig_roi.update_layout( height=320, yaxis=dict( title="Cumulative time saved (seconds)", gridcolor="#f1f5f9", linecolor="#e2e8f0", zeroline=True, zerolinecolor="#e2e8f0", zerolinewidth=1, ), xaxis=dict(title="Days deployed", gridcolor="#f1f5f9", linecolor="#e2e8f0"), legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), margin=dict(l=60, r=20, t=40, b=40), plot_bgcolor="white", paper_bgcolor="white", font=dict(family="Inter, sans-serif", color="#0f172a"), ) # ── Metric cards ──────────────────────────────────────────────────────── _be_display = f"{_breakeven_images:,.0f}" if _breakeven_images != float('inf') else "∞" _be_days_display = f"{_breakeven_days:.1f}" if _breakeven_days != float('inf') else "∞" _total_saved_s = max(0, _cumulative_saved_s[-1]) _card_color_be = COLORS["GreenLine"] if _roi_positive else COLORS["OrangeLine"] _card_color_saved = COLORS["GreenLine"] if _total_saved_s > 0 else COLORS["RedLine"] _metrics_html = f"""
Break-Even Images
{_be_display}
images needed
Break-Even Day
{_be_days_display}
of {_duration_days}-day deployment
Total Images Served
{_total_images:,.0f}
over deployment
Net Time Saved
{_total_saved_s:,.0f}s
after compile cost
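Compile Overhead
{_compile_s}s
one-time cost, paid up front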
""" # ── FAILURE STATE ───────────────────────────────────────────────────────── # Trigger: break-even > deployment duration (compilation not justified) if not _roi_positive: _failure_banner = mo.callout( mo.md( f"**Compilation Not Justified.** " f"Break-even requires **{_be_display} images** (day {_be_days_display}), " f"but your deployment ends after **day {_duration_days}** " f"({_total_images:,.0f} total images). " f"Eager mode is faster overall for this deployment window. " f"To fix: increase deployment duration, inference volume, or throughput gain — " f"or use eager mode and accept the {_gain_pct*100:.0f}% throughput cost." ), kind="danger", ) else: _net_days_positive = _duration_days - _breakeven_days _failure_banner = mo.callout( mo.md( f"**Compilation ROI Positive.** " f"Break-even at day {_be_days_display} leaves " f"**{_net_days_positive:.1f} days** of net-positive throughput. " f"Total time saved after compile cost: **{_total_saved_s:,.0f} seconds**." ), kind="success", ) _formula_text = f"""
Break-Even Formula (frameworks.qmd §Compilation)
N_breakeven = t_compile / (1/R_eager - 1/R_compiled)
= {_compile_s}s / (1/{_throughput_eager:.0f} - 1/{_throughput_compiled:.0f})
= {_compile_s * 1000:,.0f} ms / {_delta_t_per_img * 1000:.4f} ms saved per image
= {_be_display} images = day {_be_days_display} at {_volume_per_day:,} images/day
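Equivalent form: N_breakeven = t_compile × R_eager × R_compiled / (R_compiled - R_eager) = {_compile_s} × {_throughput_eager:.0f} × {_throughput_compiled:.0f} / {_throughput_compiled - _throughput_eager:.0f} = {_be_display} images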
""" return mo.vstack([ mo.ui.plotly(_fig_roi), mo.Html(_metrics_html), _failure_banner, mo.Html(_formula_text), ]) # ─── ACT 2: PREDICTION REVEAL ───────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, act2_pred, RESNET50_BREAKEVEN_IMAGES): _correct_images = RESNET50_BREAKEVEN_IMAGES # ~134,000 _predicted_map = {"A": 1000, "B": 10000, "C": 130000, "D": 10000000} _predicted_images = _predicted_map.get(act2_pred.value, 0) _is_correct = act2_pred.value == "C" _ratio = _correct_images / _predicted_images if _predicted_images > 0 else float('inf') if _is_correct: _kind = "success" _text = f"**Your prediction was correct: ~130,000 images.** The calculation: Δt per image = 1/1,450 − 1/2,150 ≈ 0.224 ms. Break-even = 30 s / 0.000224 s/image ≈ **{_correct_images:,} images**. The gain per image is small because the base throughput is already fast — you need high volume to recover the fixed compile cost." elif _ratio > 5: _kind = "warn" _text = f"**You predicted {_predicted_images:,}. The actual value is ~{_correct_images:,}. You were off by {_ratio:.1f}×.** The gain per image is only 0.224 ms — much smaller than it feels. A 48% throughput gain sounds large, but when you are processing at 1,450 img/sec, each image takes just 0.69 ms. Saving 0.224 ms per image requires processing 134,000 images to recover 30 seconds of compile time." else: _kind = "warn" _text = f"**You predicted {_predicted_images:,}. The actual value is ~{_correct_images:,}. You were off by {_ratio:.1f}×.** {'Compilation requires more volume than expected because the per-image gain is small relative to the compile cost.' if _predicted_images < _correct_images else 'Compilation is more cost-effective than you expected — the per-image gain accumulates quickly at high throughput.'}" return mo.callout(mo.md(_text), kind=_kind) # ─── ACT 2: REFLECTION ──────────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): act2_reflection_a = mo.ui.radio( options={ "A) ~5× wall-clock speedup only — no HBM traffic change": "A", "B) ~3× HBM traffic reduction and ~5× wall-clock speedup": "B", "C) 10–20× HBM traffic reduction (same as FlashAttention)": "C", "D) ~3× HBM traffic reduction only — no wall-clock change": "D", }, label="""**Structured Reflection — Part 1.** Kernel fusion of LayerNorm + Dropout + ReLU (3 element-wise ops → 1 fused kernel) provides:""", ) act2_reflection_a return (act2_reflection_a,) @app.cell(hide_code=True) def _(mo, act2_reflection_a): mo.stop( act2_reflection_a.value is None, mo.callout(mo.md("Select your answer to continue."), kind="warn"), ) _correct_a = act2_reflection_a.value == "B" _fb_a_map = { "A": "**Not quite.** The wall-clock speedup (approximately 5×) comes from *two* combined effects: eliminating 2 of 3 kernel launch overheads (dispatch reduction) AND eliminating intermediate HBM writes and reads (memory traffic reduction). You cannot get the full speedup without both. The 5× figure without the 3× HBM reduction is not accurate.", "B": "**Correct.** Fusing 3 element-wise ops into 1 eliminates 2 intermediate HBM read-write pairs: 6 HBM operations (3 reads + 3 writes) reduce to 2 (1 read + 1 write) = **3× HBM traffic reduction**. Combined with eliminating 2 of 3 kernel launch overheads, the total wall-clock speedup is approximately **5×** — as reported in @sec-ml-frameworks.", "C": "**Not quite.** The 10–20× figure applies to **FlashAttention**, which tiles the N×N attention matrix to avoid materializing it in HBM entirely. 
That is a different mechanism — tiling reduces quadratic memory cost to linear. Element-wise op fusion (LayerNorm, Dropout, ReLU) reduces traffic proportionally to the number of ops fused: 3 ops = 3× reduction. Different operations, different physics.", "D": "**Not quite.** Both effects occur simultaneously. Eliminating intermediate HBM traffic (3× reduction) speeds up the memory-bound portion. Eliminating 2 of 3 kernel launches reduces dispatch overhead. Together these produce the ~5× wall-clock speedup — you cannot separate them in practice.", } return mo.callout(mo.md(_fb_a_map[act2_reflection_a.value]), kind="success" if _correct_a else "warn") @app.cell(hide_code=True) def _(mo, act2_reflection_a): mo.stop(act2_reflection_a.value is None) act2_reflection_b = mo.ui.radio( options={ "A) Near-zero speedup — dispatch overhead (10 μs) is negligible vs. 200 ms compute": "A", "B) About 48% speedup — the same as ResNet-50": "B", "C) Greater than 2× speedup — large kernels benefit most from optimization": "C", "D) Negative speedup — compilation makes large kernels slower": "D", }, label="""**Structured Reflection — Part 2.** For a model with one giant matrix multiply (200 ms compute per kernel, 1 kernel total), torch.compile will provide:""", ) act2_reflection_b return (act2_reflection_b,) @app.cell(hide_code=True) def _(mo, act2_reflection_b): mo.stop( act2_reflection_b.value is None, mo.callout(mo.md("Select your answer to see the explanation."), kind="warn"), ) _correct_b = act2_reflection_b.value == "A" _fb_b_map = { "A": "**Correct.** The dispatch tax for 1 kernel at 10 μs is 10 μs / (10 μs + 200,000 μs) = **0.005%** of total time. Compilation fuses zero ops (there is only one kernel) and eliminates essentially zero dispatch overhead. For large matmul-dominant workloads like GPT-2 forward passes, compilation provides marginal gains — the hardware is already efficient at that scale.", "B": "**Not quite.** The 48% figure applies to ResNet-50, which has approximately 200 medium-sized kernels where dispatch overhead is a meaningful fraction of total time. A single 200 ms kernel has a dispatch ratio of 10 μs / 200 ms = 0.005%. Compilation cannot exploit what is not fragmented.", "C": "**Not quite.** Large kernels benefit *least* from compilation because the dispatch tax is already negligible. The 5–20 μs overhead per launch is fixed by CPU driver latency — it does not scale with kernel size. When compute time is 200 ms, the dispatch component is invisible.", "D": "**Not quite.** Compilation does not make correct kernels slower (it may add negligible warm-up cost). The answer is near-zero speedup, not negative. 
Compilation at worst adds trace overhead; the 30-second compile time has no effect on per-call performance after the first run.", } return mo.callout(mo.md(_fb_b_map[act2_reflection_b.value]), kind="success" if _correct_b else "warn") # ─── ACT 2: MATHPEEK ────────────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): mo.accordion({ "The governing equations": mo.md(r""" **Total Forward Pass Time:** $$T_{total} = N_{kernels} \cdot t_{dispatch} + t_{compute} + t_{memory}$$ - **N\_kernels** — kernel launches per forward pass (reduced by compilation via fusion) - **t\_dispatch** — CPU-side overhead per launch: 5–20 μs (Cloud); 50 μs (Edge) - **t\_compute** — GPU arithmetic time (unchanged by compilation) - **t\_memory** — HBM transfer time (reduced by kernel fusion via fewer intermediate tensors) **Compilation Break-Even:** $$N_{break-even} = \frac{t_{compile}}{\Delta t_{per\text{-}inference}} = \frac{t_{compile}}{\frac{1}{R_{eager}} - \frac{1}{R_{compiled}}}$$ **ResNet-50 numerical check (frameworks.qmd §Compilation):** $$N_{break-even} = \frac{30\text{ s}}{\frac{1}{1450} - \frac{1}{2150}} \approx 134{,}000 \text{ images}$$ **Element-wise fusion HBM traffic reduction:** $$\frac{\text{HBM ops}_{unfused}}{\text{HBM ops}_{fused}} = \frac{2N_{ops}}{2} = N_{ops}\text{-fold reduction}$$ For LayerNorm + Dropout + ReLU ($N_{ops} = 3$): **3× HBM traffic reduction**, **~5× wall-clock speedup** (includes dispatch elimination). *Note: FlashAttention's 10–20× reduction comes from attention matrix tiling ($O(N^2) \to O(N)$ HBM access) — a different mechanism from element-wise fusion.* """) }) return # ───────────────────────────────────────────────────────────────────────────── # DESIGN LEDGER SAVE + HUD FOOTER # ───────────────────────────────────────────────────────────────────────────── @app.cell(hide_code=True) def _( mo, ledger, COLORS, act1_pred, act1_reflection, act2_pred, act2_reflection_a, act2_reflection_b, act2_fusion_toggle, act2_compile_time_slider, act2_volume_slider, act2_duration_slider, context_toggle, RESNET50_THROUGHPUT_EAGER, RESNET50_THROUGHPUT_COMPILED, RESNET50_BREAKEVEN_IMAGES, ): _ctx = context_toggle.value # Determine completion state _act1_done = act1_pred.value is not None and act1_reflection.value is not None _act2_done = ( act2_pred.value is not None and act2_reflection_a.value is not None and act2_reflection_b.value is not None ) # Compute break-even with current slider state _compile_s = act2_compile_time_slider.value _gain_pct = 0.48 # default ResNet-50 gain _throughput_eager = RESNET50_THROUGHPUT_EAGER _throughput_compiled = _throughput_eager * (1 + _gain_pct) _delta_t = (1 / _throughput_eager) - (1 / _throughput_compiled) _breakeven_images = int(_compile_s / _delta_t) if _delta_t > 0 else 999999 _total_images = act2_volume_slider.value * act2_duration_slider.value _roi_positive = _total_images >= _breakeven_images # KWS utilization values (fixed physics, for ledger record) _kws_eager_util = 33 _kws_compiled_util = 67 # Save to Design Ledger (only when both acts complete) if _act1_done and _act2_done: ledger.save( chapter=7, design={ "context": _ctx, "execution_mode": "compiled" if act2_fusion_toggle.value == "fused" else "eager", "fusion_enabled": act2_fusion_toggle.value == "fused", "compilation_roi_positive": _roi_positive, "breakeven_inferences": _breakeven_images, "kws_utilization_eager_pct": _kws_eager_util, "kws_utilization_compiled_pct": _kws_compiled_util, "act1_prediction": act1_pred.value, "act1_correct": act1_pred.value == "C", 
"act2_result": float(_breakeven_images), "act2_decision": "compiled" if _roi_positive else "eager", "constraint_hit": not _roi_positive, }, ) # Progress indicators _act1_indicator = f'COMPLETE' if _act1_done else f'IN PROGRESS' _act2_indicator = f'COMPLETE' if _act2_done else f'IN PROGRESS' _ctx_color = COLORS["Cloud"] if _ctx == "cloud" else COLORS["Edge"] _ctx_label = "Cloud (A100)" if _ctx == "cloud" else "Edge (Jetson Orin NX)" mo.vstack([ mo.md("---"), mo.Html(f"""
LAB 07 · Frameworks
CONTEXT {_ctx_label}
ACT 1 {_act1_indicator}
ACT 2 {_act2_indicator}
FUSION {' ENABLED' if act2_fusion_toggle.value == 'fused' else ' DISABLED'}
COMPILE ROI {' POSITIVE' if _roi_positive else ' NEGATIVE'}
BREAK-EVEN {_breakeven_images:,} imgs
KWS UTIL {_kws_eager_util}% eager → {_kws_compiled_util}% compiled
"""), mo.callout(mo.md("""**Ledger saved to chapter 7.** Lab 08 (Training) reads `kws_utilization_compiled_pct` to initialize the MFU pipeline breakdown. Lab 11 (HW Acceleration) reads `fusion_enabled` and arithmetic intensity values to set the starting position on the Roofline curve."""), kind="info") if _act1_done and _act2_done else mo.callout(mo.md("Complete both acts to save results to the Design Ledger and unlock cross-lab continuity."), kind="warn"), ]) return if __name__ == "__main__": app.run()