mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-04-30 09:38:38 -05:00
Add all Vol1 (labs 01-16) and Vol2 (labs 01-17) interactive Marimo labs as the first full first-pass implementation of the ML Systems curriculum labs. Each lab follows the PROTOCOL 2-Act structure (35-40 min): - Act I: Calibration with prediction lock → instruments → overlay - Act II: Design challenge with failure states and reflection Key pedagogical instruments introduced progressively: - Vol1: D·A·M Triad, Iron Law, Memory Ledger, Roofline, Amdahl's Law, Little's Law, P99 Histogram, Compression Frontier, Chouldechova theorem - Vol2: NVLink vs PCIe cliff, Bisection BW, Young-Daly T*, Parallelism Paradox, AllReduce ring vs tree, KV-cache model, Jevons Paradox, DP ε-δ tradeoff, SLO composition, Adversarial Pareto, two-volume synthesis capstone All 35 staged files pass AST syntax verification (36/36 including lab_00). Also includes: - labs/LABS_SPEC.md: authoritative sub-agent brief for all lab conventions - labs/core/style.py: expanded unified design system with semantic color tokens
1203 lines
54 KiB
Python
1203 lines
54 KiB
Python
import marimo
|
||
|
||
__generated_with = "0.19.6"
|
||
app = marimo.App(width="full")
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# LAB 01: THE MAGNITUDE AWAKENING
|
||
#
|
||
# Chapter: Introduction to ML Systems (@sec-introduction)
|
||
# Core Invariant: The D·A·M Triad (Data, Algorithm, Machine) and the
|
||
# 9-order-of-magnitude gap that prevents a universal software stack.
|
||
#
|
||
# 2-Act Structure (35-40 minutes):
|
||
# Act I — The Scale Blindspot (12-15 min)
|
||
# Calibrate student intuition about the H100 ↔ Cortex-M7 compute gap.
|
||
# The central question: does a 6-order-of-magnitude gap matter for
|
||
# software architecture? The answer is: yes — it forces separate stacks.
|
||
#
|
||
# Act II — The Iron Law Preview (20-25 min)
|
||
# Apply T = D/BW + O/R + L to ResNet-50 on both deployment contexts.
|
||
# The central question: which Iron Law term dominates at batch=1?
|
||
# The OOM failure state: ResNet-50 (100 MB) > Cortex-M7 (512 KB).
|
||
#
|
||
# Deployment Contexts:
|
||
# Cloud: NVIDIA H100 SXM5 (989 TFLOPs FP16, 3350 GB/s HBM3, 80 GB, 700 W)
|
||
# TinyML: Cortex-M7 (0.001 TFLOPs, 0.05 GB/s, 512 KB SRAM, 0.1 W)
|
||
#
|
||
# Design Ledger: saves chapter=1 with context, prediction accuracy, OOM trigger.
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
# ─── CELL 0: SETUP (hide_code=False — leave visible for instructor inspection) ─
|
||
@app.cell
|
||
def _():
|
||
import marimo as mo
|
||
import sys
|
||
import math
|
||
from pathlib import Path
|
||
import plotly.graph_objects as go
|
||
import numpy as np
|
||
|
||
_root = Path(__file__).resolve().parents[2]
|
||
if str(_root) not in sys.path:
|
||
sys.path.insert(0, str(_root))
|
||
|
||
from labs.core.state import DesignLedger
|
||
from labs.core.style import COLORS, LAB_CSS, apply_plotly_theme
|
||
|
||
ledger = DesignLedger()
|
||
return COLORS, LAB_CSS, DesignLedger, apply_plotly_theme, go, ledger, math, mo, np
|
||
|
||
|
||
# ─── CELL 1: HEADER ────────────────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(COLORS, LAB_CSS, mo):
|
||
_c_cloud = COLORS["Cloud"]
|
||
_c_tiny = COLORS["Tiny"]
|
||
_c_surface0 = COLORS["Surface0"]
|
||
_c_surface1 = COLORS["Surface1"]
|
||
_header = mo.Html(f"""
|
||
{LAB_CSS}
|
||
<div style="background: linear-gradient(135deg, {_c_surface0} 0%, {_c_surface1} 100%);
|
||
border-radius: 16px; padding: 32px 40px; margin-bottom: 8px;
|
||
border: 1px solid #2d3748;">
|
||
<div style="display: flex; justify-content: space-between; align-items: flex-start; flex-wrap: wrap; gap: 16px;">
|
||
<div>
|
||
<div style="font-size: 0.72rem; font-weight: 700; color: #94a3b8;
|
||
text-transform: uppercase; letter-spacing: 0.14em; margin-bottom: 8px;">
|
||
Vol 1 · Lab 01 · Introduction to ML Systems
|
||
</div>
|
||
<div style="font-size: 2.0rem; font-weight: 800; color: #f1f5f9; line-height: 1.15; margin-bottom: 10px;">
|
||
The Magnitude Awakening
|
||
</div>
|
||
<div style="font-size: 0.95rem; color: #94a3b8; max-width: 580px; line-height: 1.6;">
|
||
Nine orders of magnitude separate a cloud accelerator from a microcontroller.
|
||
This lab forces you to confront that gap quantitatively — and discover why it
|
||
makes a universal ML software stack physically impossible.
|
||
</div>
|
||
</div>
|
||
<div style="display: flex; flex-direction: column; gap: 8px; flex-shrink: 0;">
|
||
<span class="badge badge-info">D·A·M Triad</span>
|
||
<span class="badge badge-info">Iron Law T = D/BW + O/R + L</span>
|
||
<span class="badge badge-warn">35-40 minutes · 2 Acts</span>
|
||
</div>
|
||
</div>
|
||
<div style="display: flex; gap: 16px; margin-top: 20px; flex-wrap: wrap;">
|
||
<div style="background: rgba(99,102,241,0.15); border: 1px solid rgba(99,102,241,0.4);
|
||
border-radius: 8px; padding: 10px 16px; font-size: 0.82rem;">
|
||
<span style="color: {_c_cloud}; font-weight: 700;">Cloud Context</span>
|
||
<span style="color: #94a3b8;"> — NVIDIA H100 · 989 TFLOPS · 3350 GB/s · 80 GB · 700 W</span>
|
||
</div>
|
||
<div style="background: rgba(0,143,69,0.12); border: 1px solid rgba(0,143,69,0.35);
|
||
border-radius: 8px; padding: 10px 16px; font-size: 0.82rem;">
|
||
<span style="color: {_c_tiny}; font-weight: 700;">TinyML Context</span>
|
||
<span style="color: #94a3b8;"> — Cortex-M7 · 0.001 TFLOPS · 0.05 GB/s · 512 KB · 0.1 W</span>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
""")
|
||
_header
|
||
return
|
||
|
||
|
||
# ─── CELL 2: RECOMMENDED READING ───────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.callout(mo.md("""
|
||
**Recommended Reading** — Complete the following before this lab:
|
||
|
||
- **@sec-introduction-scaling-regimes** — The AI Triad and the two scaling regimes (single-node vs. distributed fleet)
|
||
- **@sec-introduction-deployment-spectrum-a38c** — The four deployment paradigms and their power/memory constraints
|
||
- **@sec-introduction-iron-law-ml-systems-c32a** — The Iron Law equation `T = D/BW + O/R + L` and its three terms
|
||
- **@sec-introduction-dam** — D·A·M taxonomy as a diagnostic lens for bottleneck analysis
|
||
"""), kind="info")
|
||
return
|
||
|
||
|
||
# ─── CELL 3: CONTEXT TOGGLE + LEDGER LOAD ──────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(COLORS, mo):
|
||
_prior_ctx = "cloud" # default — overridden by ledger if available
|
||
context_toggle = mo.ui.radio(
|
||
options={"Cloud (H100)": "cloud", "TinyML (Cortex-M7)": "tiny"},
|
||
value="Cloud (H100)",
|
||
label="Deployment context for this session:",
|
||
inline=True,
|
||
)
|
||
mo.hstack([
|
||
mo.Html(f"""
|
||
<div style="font-size:0.78rem; font-weight:700; color:{COLORS['TextMuted']};
|
||
text-transform:uppercase; letter-spacing:0.08em; margin-right:8px; padding-top:2px;">
|
||
Active Context:
|
||
</div>
|
||
"""),
|
||
context_toggle,
|
||
], justify="start", gap=0)
|
||
return (context_toggle,)
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════════════
|
||
# ACT I — THE SCALE BLINDSPOT
|
||
# ═══════════════════════════════════════════════════════════════════════════════
|
||
|
||
|
||
# ─── ACT I: SECTION HEADER ─────────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.md("""
|
||
---
|
||
## Act I — The Scale Blindspot
|
||
*Calibration · 12-15 minutes*
|
||
""")
|
||
return
|
||
|
||
|
||
# ─── ACT I: STAKEHOLDER MESSAGE ────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(COLORS, mo):
|
||
_color = COLORS["Cloud"]
|
||
_bg = COLORS["BlueL"]
|
||
mo.Html(f"""
|
||
<div style="border-left: 4px solid {_color}; background: {_bg};
|
||
border-radius: 0 10px 10px 0; padding: 16px 22px; margin: 12px 0;">
|
||
<div style="font-size: 0.72rem; font-weight: 700; color: {_color};
|
||
text-transform: uppercase; letter-spacing: 0.1em; margin-bottom: 6px;">
|
||
Incoming Message · VP of Engineering
|
||
</div>
|
||
<div style="font-style: italic; font-size: 1.0rem; color: #1e293b; line-height: 1.65;">
|
||
"We need keyword spotting on every smart doorbell we ship. Our cloud team
|
||
says just run the model on our H100 cluster — same code we use for everything else.
|
||
The microcontrollers cost $2.40 each and have 512 KB of RAM. Before we commit
|
||
to $50,000 in cloud infrastructure, can you explain why we can't use the same
|
||
software stack for both?"
|
||
</div>
|
||
</div>
|
||
""")
|
||
return
|
||
|
||
|
||
# ─── ACT I: CONCEPT FRAMING ────────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.md("""
|
||
The VP is asking a systems engineering question, not a machine learning question.
|
||
The answer depends on quantifying the physical gap between the two deployment targets —
|
||
not in vague terms like "the cloud is faster," but in exact orders of magnitude.
|
||
|
||
The **D·A·M Triad** (Data, Algorithm, Machine) from @sec-introduction provides the
|
||
diagnostic lens. When the *Machine* axis spans nine orders of magnitude, the
|
||
*Algorithm* and *Data* pipelines cannot be shared. The gap is not an implementation
|
||
detail — it is a physical constraint that forces architectural separation.
|
||
|
||
Before looking at any data, commit to a prediction.
|
||
""")
|
||
return
|
||
|
||
|
||
# ─── ACT I: PREDICTION LOCK ────────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.md("### Your Prediction")
|
||
return
|
||
|
||
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
act1_pred = mo.ui.radio(
|
||
options={
|
||
"A) ~100× (roughly two orders of magnitude)": "A",
|
||
"B) ~10,000× (roughly four orders of magnitude)": "B",
|
||
"C) ~1,000,000× (roughly six orders of magnitude)": "C",
|
||
"D) ~1,000,000,000× (roughly nine orders of magnitude)": "D",
|
||
},
|
||
label="By what factor does the H100 exceed the Cortex-M7 in peak compute throughput (TFLOPS)?",
|
||
)
|
||
act1_pred
|
||
return (act1_pred,)
|
||
|
||
|
||
@app.cell(hide_code=True)
|
||
def _(act1_pred, mo):
|
||
mo.stop(
|
||
act1_pred.value is None,
|
||
mo.callout(
|
||
mo.md("Select your prediction above to unlock the Act I instruments."),
|
||
kind="warn",
|
||
),
|
||
)
|
||
mo.md("")
|
||
return
|
||
|
||
|
||
# ─── ACT I: INSTRUMENT — 4-WAY COMPARISON BAR CHART (LOG SCALE) ────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.md("### The Magnitude Landscape")
|
||
return
|
||
|
||
|
||
@app.cell(hide_code=True)
|
||
def _(COLORS, apply_plotly_theme, context_toggle, go, math, mo, np):
|
||
# ── Hardware constants (source: @sec-introduction-deployment-spectrum-a38c,
|
||
# NVIDIA H100 SXM5 spec sheet, ARM Cortex-M7 TRM, Jetson Orin NX spec)
|
||
#
|
||
# Cloud — NVIDIA H100 SXM5 (Hopper, 2022)
|
||
H100_TFLOPS = 989.0 # TFLOPs FP16 Tensor Core; source: NVIDIA H100 Data Sheet
|
||
H100_BW_GBS = 3350.0 # GB/s HBM3; source: NVIDIA H100 Data Sheet
|
||
H100_RAM_GB = 80.0 # GB HBM3e
|
||
H100_TDP_W = 700.0 # Watts; SXM variant
|
||
H100_TFLOPS_MFLOPS = H100_TFLOPS * 1e6 # convert to MFLOPS for ratio
|
||
|
||
# Edge — NVIDIA Jetson Orin NX (2023)
|
||
ORIN_TFLOPS = 100.0 # TFLOPs INT8 equivalent; source: Jetson Orin NX spec
|
||
ORIN_BW_GBS = 102.0 # GB/s
|
||
ORIN_RAM_GB = 16.0 # GB
|
||
ORIN_TDP_W = 25.0 # Watts
|
||
|
||
# Mobile — Apple A17-class NPU (2023, representative smartphone)
|
||
MOBILE_TOPS = 35.0 # TOPS INT8; source: @sec-introduction-deployment-spectrum-a38c
|
||
MOBILE_BW_GBS = 68.0 # GB/s
|
||
MOBILE_RAM_GB = 8.0 # GB
|
||
MOBILE_TDP_W = 5.0 # Watts sustained
|
||
|
||
# TinyML — ARM Cortex-M7 (representative MCU, e.g. STM32H7 class)
|
||
MCU_TFLOPS = 0.001 # TFLOPs (~1 GFLOPS FP32 DSP); source: hardware.py Tiny.Generic_MCU
|
||
MCU_BW_GBS = 0.05 # GB/s; source: hardware.py Tiny.Generic_MCU
|
||
MCU_SRAM_MB = 0.512 # MB (512 KB); source: constants.py MCU_RAM_KIB
|
||
MCU_TDP_W = 0.1 # Watts; source: @sec-introduction-deployment-spectrum-a38c
|
||
|
||
# ── Regime labels and colors
|
||
_regimes = ["TinyML\n(Cortex-M7)", "Mobile\n(Smartphone NPU)", "Edge\n(Jetson Orin NX)", "Cloud\n(H100)"]
|
||
_colors_bar = [COLORS["Tiny"], COLORS["Mobile"], COLORS["Edge"], COLORS["Cloud"]]
|
||
|
||
# ── Highlight the selected context
|
||
_ctx = context_toggle.value
|
||
_bar_opacities = []
|
||
for _r in ["tiny", "mobile", "edge", "cloud"]:
|
||
if _r == _ctx:
|
||
_bar_opacities.append(1.0)
|
||
else:
|
||
_bar_opacities.append(0.35)
|
||
|
||
_highlight_colors = []
|
||
for _i, (_c, _o) in enumerate(zip(_colors_bar, _bar_opacities)):
|
||
_highlight_colors.append(_c if _o == 1.0 else "#94a3b8")
|
||
|
||
# ── Compute (TFLOPs) — log scale
|
||
_compute = [MCU_TFLOPS, MOBILE_TOPS, ORIN_TFLOPS, H100_TFLOPS]
|
||
_compute_log = [math.log10(v) for v in _compute]
|
||
|
||
# ── Memory BW (GB/s) — log scale
|
||
_bw = [MCU_BW_GBS, MOBILE_BW_GBS, ORIN_BW_GBS, H100_BW_GBS]
|
||
_bw_log = [math.log10(v) for v in _bw]
|
||
|
||
# ── Power (W) — log scale
|
||
_power = [MCU_TDP_W, MOBILE_TDP_W, ORIN_TDP_W, H100_TDP_W]
|
||
_power_log = [math.log10(v) for v in _power]
|
||
|
||
# ── Build figure with subplots
|
||
from plotly.subplots import make_subplots
|
||
_fig = make_subplots(
|
||
rows=1, cols=3,
|
||
subplot_titles=["Peak Compute (TFLOPs)", "Memory Bandwidth (GB/s)", "Power Budget (W)"],
|
||
horizontal_spacing=0.12,
|
||
)
|
||
|
||
_reg_display = ["TinyML", "Mobile", "Edge", "Cloud"]
|
||
|
||
for _col, (_log_vals, _raw_vals, _fmt_unit) in enumerate(
|
||
zip(
|
||
[_compute_log, _bw_log, _power_log],
|
||
[_compute, _bw, _power],
|
||
["TFLOPS", "GB/s", "W"],
|
||
),
|
||
start=1,
|
||
):
|
||
_bar_colors_col = []
|
||
for _j in range(4):
|
||
_bar_colors_col.append(_colors_bar[_j] if _bar_opacities[_j] == 1.0 else "#c7cdd4")
|
||
|
||
_hover = [f"{_reg_display[_j]}: {_raw_vals[_j]:,.3g} {_fmt_unit}<br>log₁₀ = {_log_vals[_j]:.1f}" for _j in range(4)]
|
||
|
||
_fig.add_trace(
|
||
go.Bar(
|
||
x=_reg_display,
|
||
y=_log_vals,
|
||
marker_color=_bar_colors_col,
|
||
text=[f"10^{v:.0f}" for v in _log_vals],
|
||
textposition="outside",
|
||
hovertext=_hover,
|
||
hoverinfo="text",
|
||
showlegend=False,
|
||
),
|
||
row=1, col=_col,
|
||
)
|
||
|
||
_fig.update_layout(
|
||
height=380,
|
||
title_text="",
|
||
margin=dict(t=50, b=60, l=40, r=20),
|
||
)
|
||
_fig.update_yaxes(
|
||
title_text="log₁₀ (value)",
|
||
range=[-3.5, 4.0],
|
||
tickvals=[-3, -2, -1, 0, 1, 2, 3],
|
||
ticktext=["10⁻³", "10⁻²", "10⁻¹", "10⁰", "10¹", "10²", "10³"],
|
||
)
|
||
|
||
apply_plotly_theme(_fig)
|
||
mo.ui.plotly(_fig)
|
||
return (
|
||
H100_BW_GBS,
|
||
H100_RAM_GB,
|
||
H100_TFLOPS,
|
||
H100_TDP_W,
|
||
MCU_BW_GBS,
|
||
MCU_SRAM_MB,
|
||
MCU_TFLOPS,
|
||
MCU_TDP_W,
|
||
MOBILE_BW_GBS,
|
||
MOBILE_RAM_GB,
|
||
MOBILE_TDP_W,
|
||
MOBILE_TOPS,
|
||
ORIN_BW_GBS,
|
||
ORIN_RAM_GB,
|
||
ORIN_TDP_W,
|
||
ORIN_TFLOPS,
|
||
)
|
||
|
||
|
||
# ─── ACT I: QUANTITATIVE GAP TABLE ─────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(
|
||
COLORS,
|
||
H100_BW_GBS,
|
||
H100_RAM_GB,
|
||
H100_TFLOPS,
|
||
H100_TDP_W,
|
||
MCU_BW_GBS,
|
||
MCU_SRAM_MB,
|
||
MCU_TFLOPS,
|
||
MCU_TDP_W,
|
||
mo,
|
||
):
|
||
_c_cloud = COLORS["Cloud"]
|
||
_c_tiny = COLORS["Tiny"]
|
||
_c_red = COLORS["RedLine"]
|
||
_c_border = COLORS["Border"]
|
||
|
||
_compute_ratio = H100_TFLOPS / MCU_TFLOPS
|
||
_bw_ratio = H100_BW_GBS / MCU_BW_GBS
|
||
_ram_ratio_num = (H100_RAM_GB * 1e3) / MCU_SRAM_MB # both in MB
|
||
_power_ratio = H100_TDP_W / MCU_TDP_W
|
||
|
||
def _ratio_badge(r):
|
||
if r >= 1e6:
|
||
return f'<span style="color:{_c_red}; font-weight:800;">{r:,.0f}×</span>'
|
||
elif r >= 1e4:
|
||
return f'<span style="color:#CC5500; font-weight:700;">{r:,.0f}×</span>'
|
||
else:
|
||
return f'<span style="color:{COLORS["BlueLine"]}; font-weight:700;">{r:,.0f}×</span>'
|
||
|
||
mo.Html(f"""
|
||
<div class="lab-card" style="margin: 8px 0;">
|
||
<table style="width:100%; border-collapse:collapse; font-size:0.88rem;">
|
||
<thead>
|
||
<tr style="border-bottom:2px solid {_c_border};">
|
||
<th style="text-align:left; padding:8px 12px; color:{COLORS['TextMuted']}; font-weight:700; text-transform:uppercase; font-size:0.72rem; letter-spacing:0.08em;">Axis</th>
|
||
<th style="text-align:right; padding:8px 12px; color:{_c_cloud}; font-weight:700;">Cloud (H100)</th>
|
||
<th style="text-align:right; padding:8px 12px; color:{_c_tiny}; font-weight:700;">TinyML (Cortex-M7)</th>
|
||
<th style="text-align:right; padding:8px 12px; color:{COLORS['Text']}; font-weight:700;">Gap (H100 / MCU)</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr style="border-bottom:1px solid {_c_border};">
|
||
<td style="padding:8px 12px; color:{COLORS['TextSec']}; font-weight:600;">Peak Compute (TFLOPs)</td>
|
||
<td style="text-align:right; padding:8px 12px; font-family:monospace;">{H100_TFLOPS:,}</td>
|
||
<td style="text-align:right; padding:8px 12px; font-family:monospace;">{MCU_TFLOPS:.3f}</td>
|
||
<td style="text-align:right; padding:8px 12px;">{_ratio_badge(_compute_ratio)}</td>
|
||
</tr>
|
||
<tr style="border-bottom:1px solid {_c_border};">
|
||
<td style="padding:8px 12px; color:{COLORS['TextSec']}; font-weight:600;">Memory Bandwidth (GB/s)</td>
|
||
<td style="text-align:right; padding:8px 12px; font-family:monospace;">{H100_BW_GBS:,}</td>
|
||
<td style="text-align:right; padding:8px 12px; font-family:monospace;">{MCU_BW_GBS}</td>
|
||
<td style="text-align:right; padding:8px 12px;">{_ratio_badge(_bw_ratio)}</td>
|
||
</tr>
|
||
<tr style="border-bottom:1px solid {_c_border};">
|
||
<td style="padding:8px 12px; color:{COLORS['TextSec']}; font-weight:600;">Memory Capacity</td>
|
||
<td style="text-align:right; padding:8px 12px; font-family:monospace;">{H100_RAM_GB:,} GB</td>
|
||
<td style="text-align:right; padding:8px 12px; font-family:monospace;">{MCU_SRAM_MB * 1000:.0f} KB</td>
|
||
<td style="text-align:right; padding:8px 12px;">{_ratio_badge(_ram_ratio_num)}</td>
|
||
</tr>
|
||
<tr>
|
||
<td style="padding:8px 12px; color:{COLORS['TextSec']}; font-weight:600;">Power Budget (W)</td>
|
||
<td style="text-align:right; padding:8px 12px; font-family:monospace;">{H100_TDP_W:,}</td>
|
||
<td style="text-align:right; padding:8px 12px; font-family:monospace;">{MCU_TDP_W}</td>
|
||
<td style="text-align:right; padding:8px 12px;">{_ratio_badge(_power_ratio)}</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
""")
|
||
return
|
||
|
||
|
||
# ─── ACT I: PREDICTION REVEAL ──────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(H100_TFLOPS, MCU_TFLOPS, act1_pred, mo):
|
||
# Actual gap (TFLOPs ratio)
|
||
_actual_gap = H100_TFLOPS / MCU_TFLOPS # 989,000× ≈ ~10^6
|
||
|
||
_pred_map = {"A": 100, "B": 10_000, "C": 1_000_000, "D": 1_000_000_000}
|
||
_pred_val = _pred_map[act1_pred.value]
|
||
_off_factor = _actual_gap / _pred_val
|
||
|
||
_correct = act1_pred.value == "C"
|
||
|
||
if _correct:
|
||
_reveal = mo.callout(mo.md(
|
||
f"**Correct.** You predicted ~1,000,000×. "
|
||
f"The actual compute ratio is {_actual_gap:,.0f}× (H100 FP16 / Cortex-M7). "
|
||
f"This is approximately 10⁶ — six orders of magnitude. "
|
||
f"The memory bandwidth gap is similar: 3350 GB/s vs. 0.05 GB/s = 67,000×. "
|
||
f"The power gap is 7000×. Across all three D·A·M axes, the hardware is "
|
||
f"separated by 4–7 orders of magnitude — making a shared software stack "
|
||
f"physically infeasible."
|
||
), kind="success")
|
||
elif act1_pred.value == "D":
|
||
_reveal = mo.callout(mo.md(
|
||
f"**Close, but one order of magnitude too large.** "
|
||
f"You predicted ~10⁹. The actual ratio is {_actual_gap:,.0f}× (~10⁶). "
|
||
f"Nine orders of magnitude is the span across *all* D·A·M axes combined "
|
||
f"(the chapter describes this as 9 orders total when you include power, memory, "
|
||
f"and compute together). On compute alone the gap is ~10⁶. "
|
||
f"The principle holds: separate stacks are mandatory."
|
||
), kind="warn")
|
||
elif act1_pred.value == "B":
|
||
_reveal = mo.callout(mo.md(
|
||
f"**You underestimated by {_off_factor:.0f}×.** "
|
||
f"You predicted ~10,000×. The actual compute ratio is {_actual_gap:,.0f}× (~10⁶). "
|
||
f"At 10⁴, a performance difference is an engineering optimization — "
|
||
f"at 10⁶, it is an architectural boundary. The TinyML device cannot even hold "
|
||
f"a single modern model in memory, let alone execute it at useful speed."
|
||
), kind="warn")
|
||
else:
|
||
_reveal = mo.callout(mo.md(
|
||
f"**You significantly underestimated.** "
|
||
f"You predicted ~100×. The actual ratio is {_actual_gap:,.0f}× (~10⁶). "
|
||
f"A 100× gap is manageable with smart caching and batching. "
|
||
f"A 10⁶ gap is a different kind of physics: it means the hardware fundamentally "
|
||
f"cannot run the same model code, triggering the need for separate stacks, "
|
||
f"separate model formats, and separate compilation targets."
|
||
), kind="warn")
|
||
|
||
_reveal
|
||
return
|
||
|
||
|
||
# ─── ACT I: MATH PEEK ──────────────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.accordion({
|
||
"View the D·A·M Axes Definition": mo.md("""
|
||
**The D·A·M Taxonomy** (from @sec-introduction-scaling-regimes):
|
||
|
||
Every ML system is a three-way interaction between three axes:
|
||
|
||
| Axis | Governs | Lab Instruments |
|
||
|------|---------|----------------|
|
||
| **D — Data** | Volume moved (`D_vol`), bandwidth consumed | Memory BW bar chart |
|
||
| **A — Algorithm** | Operation count (`O`), model architecture | FLOPs bar chart |
|
||
| **M — Machine** | Peak throughput (`R_peak`), memory capacity, power | All three charts |
|
||
|
||
The D·A·M framework predicts that optimizing one axis in isolation shifts bottlenecks
|
||
rather than eliminating them. A 10× faster algorithm on a power-constrained device
|
||
does not help if the model still exceeds the device's memory capacity.
|
||
|
||
**The chapter claim** (@sec-introduction, line ~1778):
|
||
> "Modern models demand resources nine orders of magnitude larger [than early neural nets]."
|
||
|
||
The lab measures the *hardware* gap (6 orders on compute), not the historical *model* gap.
|
||
Both support the same conclusion: a universal software stack is physically impossible.
|
||
""")
|
||
})
|
||
return
|
||
|
||
|
||
# ─── ACT I: STRUCTURED REFLECTION ──────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.md("### Act I Reflection")
|
||
return
|
||
|
||
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
act1_reflect = mo.ui.radio(
|
||
options={
|
||
"A) We need faster algorithms for the TinyML device": "A",
|
||
"B) We need separate software stacks per deployment regime": "B",
|
||
"C) We need more training data to handle the hardware gap": "C",
|
||
"D) We need better compilers to bridge the compute difference": "D",
|
||
},
|
||
label="What does the 10⁶ compute gap between Cloud and TinyML imply for software architecture?",
|
||
)
|
||
act1_reflect
|
||
return (act1_reflect,)
|
||
|
||
|
||
@app.cell(hide_code=True)
|
||
def _(act1_reflect, mo):
|
||
mo.stop(
|
||
act1_reflect.value is None,
|
||
mo.md(""),
|
||
)
|
||
|
||
if act1_reflect.value == "B":
|
||
_fb = mo.callout(mo.md(
|
||
"**Correct.** The magnitude gap is not an optimization problem — it is an architectural "
|
||
"boundary. An H100 kernel compiled for CUDA cannot run on a Cortex-M7. The MCU has no "
|
||
"FP16 Tensor Cores, no HBM, and 512 KB of SRAM instead of 80 GB. The hardware gap "
|
||
"forces separate compilation targets, separate model formats (TFLite, ONNX vs. CUDA PTX), "
|
||
"and separate runtime environments. This is why @sec-ml-systems introduces four distinct "
|
||
"deployment paradigms rather than one universal stack."
|
||
), kind="success")
|
||
elif act1_reflect.value == "D":
|
||
_fb = mo.callout(mo.md(
|
||
"**Partially correct, but incomplete.** Better compilers (like TVM or MLIR) do help "
|
||
"bridge execution targets — but they cannot add SRAM that does not exist. A ResNet-50 "
|
||
"requires 100 MB of RAM at inference. The Cortex-M7 has 512 KB. No compiler can resolve "
|
||
"a 200× memory deficit. The fundamental answer is B: separate stacks, because the "
|
||
"physical constraints are incommensurable."
|
||
), kind="warn")
|
||
elif act1_reflect.value == "A":
|
||
_fb = mo.callout(mo.md(
|
||
"**Not quite.** A faster algorithm reduces operation count (the *Algorithm* axis of D·A·M), "
|
||
"but does not change memory bandwidth or capacity (the *Machine* axis). Even if an algorithm "
|
||
"ran in zero FLOPs, it would still need to load model weights — and 100 MB of weights do not "
|
||
"fit in 512 KB of SRAM. The constraint is physical, not algorithmic."
|
||
), kind="warn")
|
||
else:
|
||
_fb = mo.callout(mo.md(
|
||
"**Incorrect.** Data volume (the *Data* axis) is independent of hardware capacity. "
|
||
"More training data makes a better model but does not change the memory bandwidth "
|
||
"or SRAM size of the Cortex-M7. The hardware gap is a physical constraint on the "
|
||
"*Machine* axis that exists regardless of how much data the model was trained on."
|
||
), kind="warn")
|
||
|
||
_fb
|
||
return
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════════════
|
||
# ACT II — THE IRON LAW PREVIEW
|
||
# ═══════════════════════════════════════════════════════════════════════════════
|
||
|
||
|
||
# ─── ACT II: SECTION HEADER ────────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.md("""
|
||
---
|
||
## Act II — The Iron Law Preview
|
||
*Design Challenge · 20-25 minutes*
|
||
""")
|
||
return
|
||
|
||
|
||
# ─── ACT II: INTRO TEXT ────────────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.md("""
|
||
Act I established *that* a gap exists. Act II asks: given a specific model
|
||
and a specific hardware target, which physical constraint actually dominates?
|
||
|
||
The **Iron Law of ML Systems** (@sec-introduction-iron-law-ml-systems-c32a)
|
||
decomposes total execution time into three additive terms:
|
||
|
||
```
|
||
T = D/BW + O/R + L
|
||
Data Compute Overhead
|
||
Term Term Term
|
||
```
|
||
|
||
Where:
|
||
- `D` = data volume moved (bytes)
|
||
- `BW` = memory bandwidth (bytes/sec)
|
||
- `O` = arithmetic operations (FLOPs)
|
||
- `R` = peak throughput (FLOPs/sec)
|
||
- `L` = dispatch/overhead latency (sec)
|
||
|
||
Whichever term is largest determines the **binding constraint** on that hardware.
|
||
|
||
**The workload**: ResNet-50 forward pass (batch size = 1).
|
||
ResNet-50 requires 4 GFLOPs of compute and moves approximately 100 MB of data
|
||
(weights + activations) per inference. These values are from the chapter's
|
||
Lighthouse Model definitions.
|
||
""")
|
||
return
|
||
|
||
|
||
# ─── ACT II: PREDICTION LOCK ───────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.md("### Your Prediction")
|
||
return
|
||
|
||
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
act2_pred = mo.ui.radio(
|
||
options={
|
||
"A) Compute term dominates (O/R is largest)": "A",
|
||
"B) Memory term dominates (D/BW is largest)": "B",
|
||
"C) Both terms are approximately equal": "C",
|
||
"D) It depends on which hardware context is selected": "D",
|
||
},
|
||
label="On the H100 at batch=1, for ResNet-50 (4 GFLOPs, 100 MB): which Iron Law term dominates?",
|
||
)
|
||
act2_pred
|
||
return (act2_pred,)
|
||
|
||
|
||
@app.cell(hide_code=True)
|
||
def _(act2_pred, mo):
|
||
mo.stop(
|
||
act2_pred.value is None,
|
||
mo.callout(
|
||
mo.md("Select your prediction above to unlock the Act II instruments."),
|
||
kind="warn",
|
||
),
|
||
)
|
||
mo.md("")
|
||
return
|
||
|
||
|
||
# ─── ACT II: HARDWARE CONTEXT SELECTOR ─────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(COLORS, mo):
|
||
act2_context = mo.ui.radio(
|
||
options={"Cloud (H100)": "cloud", "TinyML (Cortex-M7)": "tiny"},
|
||
value="Cloud (H100)",
|
||
label="Select hardware context:",
|
||
inline=True,
|
||
)
|
||
mo.hstack([
|
||
mo.Html(f"""
|
||
<div style="font-size:0.78rem; font-weight:700; color:{COLORS['TextMuted']};
|
||
text-transform:uppercase; letter-spacing:0.08em; margin-right:8px; padding-top:2px;">
|
||
Hardware:
|
||
</div>
|
||
"""),
|
||
act2_context,
|
||
], justify="start", gap=0)
|
||
return (act2_context,)
|
||
|
||
|
||
# ─── ACT II: IRON LAW COMPUTATION + OOM DETECTION ──────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(
|
||
COLORS,
|
||
H100_BW_GBS,
|
||
H100_RAM_GB,
|
||
H100_TFLOPS,
|
||
MCU_BW_GBS,
|
||
MCU_SRAM_MB,
|
||
MCU_TFLOPS,
|
||
act2_context,
|
||
apply_plotly_theme,
|
||
go,
|
||
mo,
|
||
):
|
||
# ── ResNet-50 workload constants (source: @sec-introduction Lighthouse Models)
|
||
# ResNet-50 single forward pass at batch=1:
|
||
RESNET50_GFLOPS = 4.0 # GFLOPs; source: He et al. (2016) / chapter Lighthouse table
|
||
RESNET50_DATA_MB = 100.0 # MB (weights + activation footprint); source: chapter Lighthouse
|
||
|
||
# Convert to consistent units
|
||
_resnet_ops_gflops = RESNET50_GFLOPS # GFLOPs = 10^9 FLOPs
|
||
_resnet_data_gb = RESNET50_DATA_MB / 1000.0 # GB
|
||
|
||
_ctx = act2_context.value
|
||
|
||
if _ctx == "cloud":
|
||
_hw_name = "NVIDIA H100"
|
||
_hw_color = COLORS["Cloud"]
|
||
_hw_tflops = H100_TFLOPS # TFLOPs = 10^12 FLOPs/s
|
||
_hw_bw = H100_BW_GBS # GB/s
|
||
_hw_ram_gb = H100_RAM_GB # GB
|
||
_overhead_ms = 0.01 # ms dispatch tax (hardware.py Cloud.H100)
|
||
else:
|
||
_hw_name = "Cortex-M7"
|
||
_hw_color = COLORS["Tiny"]
|
||
_hw_tflops = MCU_TFLOPS # TFLOPs
|
||
_hw_bw = MCU_BW_GBS # GB/s
|
||
_hw_ram_gb = MCU_SRAM_MB / 1000.0 # GB (convert KB→GB: 512KB = 0.000512 GB)
|
||
_overhead_ms = 2.0 # ms dispatch tax (hardware.py Tiny.Generic_MCU)
|
||
|
||
# ── Iron Law computation
|
||
# T_mem = D_vol (GB) / BW (GB/s) → seconds → ms
|
||
# T_comp = O (GFLOPs) / R (TFLOPs/s) = O (GFLOPs) / (R * 1000 GFLOPs/s) → seconds → ms
|
||
# (1 TFLOPs = 1000 GFLOPs)
|
||
_T_mem_s = _resnet_data_gb / _hw_bw # seconds
|
||
_T_comp_s = _resnet_ops_gflops / (_hw_tflops * 1000.0) # seconds (convert TFLOPS→GFLOPS)
|
||
_T_overhead_s = _overhead_ms / 1000.0 # seconds
|
||
|
||
_T_mem_ms = _T_mem_s * 1000.0
|
||
_T_comp_ms = _T_comp_s * 1000.0
|
||
_T_total_ms = _T_mem_ms + _T_comp_ms + _overhead_ms
|
||
|
||
# ── OOM detection
|
||
# ResNet-50 needs 100 MB of RAM at minimum (weights alone).
|
||
# Cortex-M7 has 512 KB = 0.512 MB of SRAM.
|
||
_oom = _resnet_data_gb > _hw_ram_gb
|
||
_oom_ratio = _resnet_data_gb / _hw_ram_gb if _hw_ram_gb > 0 else float("inf")
|
||
|
||
# ── Determine bottleneck
|
||
_bottleneck = "memory" if _T_mem_ms > _T_comp_ms else "compute"
|
||
|
||
# ── Color bars: red if OOM, otherwise by bottleneck
|
||
if _oom:
|
||
_bar_colors = [COLORS["RedLine"], COLORS["RedLine"], COLORS["OrangeLine"]]
|
||
else:
|
||
_bar_colors = [
|
||
COLORS["RedLine"] if _bottleneck == "memory" else COLORS["BlueLine"],
|
||
COLORS["RedLine"] if _bottleneck == "compute" else COLORS["BlueLine"],
|
||
COLORS["OrangeLine"],
|
||
]
|
||
|
||
# ── Chart
|
||
_fig = go.Figure()
|
||
_fig.add_trace(go.Bar(
|
||
x=["Memory Term (D/BW)", "Compute Term (O/R)", "Overhead (L)"],
|
||
y=[_T_mem_ms, _T_comp_ms, _overhead_ms],
|
||
marker_color=_bar_colors,
|
||
text=[f"{_T_mem_ms:.4f} ms", f"{_T_comp_ms:.4f} ms", f"{_overhead_ms:.4f} ms"],
|
||
textposition="outside",
|
||
width=0.5,
|
||
))
|
||
_fig.update_layout(
|
||
height=300,
|
||
yaxis=dict(title="Latency (ms)", type="log"),
|
||
margin=dict(t=30, b=40, l=50, r=20),
|
||
)
|
||
apply_plotly_theme(_fig)
|
||
_chart = mo.ui.plotly(_fig)
|
||
|
||
# ── Metric cards
|
||
_mem_dominant_label = "BINDING" if _bottleneck == "memory" else "not binding"
|
||
_comp_dominant_label = "BINDING" if _bottleneck == "compute" else "not binding"
|
||
_mem_card_color = COLORS["RedLine"] if _bottleneck == "memory" else COLORS["BlueLine"]
|
||
_comp_card_color = COLORS["RedLine"] if _bottleneck == "compute" else COLORS["BlueLine"]
|
||
|
||
_cards = mo.Html(f"""
|
||
<div style="display:flex; gap:16px; justify-content:center; margin:16px 0; flex-wrap:wrap;">
|
||
<div style="padding:20px 24px; border:2px solid {_mem_card_color};
|
||
border-radius:10px; min-width:170px; text-align:center;
|
||
background:{'#FEF2F2' if _bottleneck == 'memory' else '#f8fafc'};">
|
||
<div style="font-size:0.78rem; font-weight:700; color:#64748b; text-transform:uppercase; letter-spacing:0.08em;">Memory Term</div>
|
||
<div style="font-size:0.72rem; color:#94a3b8; margin:2px 0;">D / BW</div>
|
||
<div style="font-size:1.9rem; font-weight:800; color:{_mem_card_color}; font-family:monospace;">
|
||
{_T_mem_ms:.4f}
|
||
</div>
|
||
<div style="font-size:0.82rem; color:#64748b;">ms</div>
|
||
<div style="font-size:0.72rem; font-weight:700; color:{_mem_card_color}; margin-top:6px; text-transform:uppercase;">
|
||
{_mem_dominant_label}
|
||
</div>
|
||
</div>
|
||
<div style="padding:20px 24px; border:2px solid {_comp_card_color};
|
||
border-radius:10px; min-width:170px; text-align:center;
|
||
background:{'#FEF2F2' if _bottleneck == 'compute' else '#f8fafc'};">
|
||
<div style="font-size:0.78rem; font-weight:700; color:#64748b; text-transform:uppercase; letter-spacing:0.08em;">Compute Term</div>
|
||
<div style="font-size:0.72rem; color:#94a3b8; margin:2px 0;">O / R</div>
|
||
<div style="font-size:1.9rem; font-weight:800; color:{_comp_card_color}; font-family:monospace;">
|
||
{_T_comp_ms:.4f}
|
||
</div>
|
||
<div style="font-size:0.82rem; color:#64748b;">ms</div>
|
||
<div style="font-size:0.72rem; font-weight:700; color:{_comp_card_color}; margin-top:6px; text-transform:uppercase;">
|
||
{_comp_dominant_label}
|
||
</div>
|
||
</div>
|
||
<div style="padding:20px 24px; border:2px solid {COLORS['OrangeLine']};
|
||
border-radius:10px; min-width:170px; text-align:center; background:#FFF7ED;">
|
||
<div style="font-size:0.78rem; font-weight:700; color:#64748b; text-transform:uppercase; letter-spacing:0.08em;">Total Latency</div>
|
||
<div style="font-size:0.72rem; color:#94a3b8; margin:2px 0;">T = D/BW + O/R + L</div>
|
||
<div style="font-size:1.9rem; font-weight:800; color:{COLORS['OrangeLine']}; font-family:monospace;">
|
||
{_T_total_ms:.3f}
|
||
</div>
|
||
<div style="font-size:0.82rem; color:#64748b;">ms</div>
|
||
<div style="font-size:0.72rem; font-weight:700; color:{_hw_color}; margin-top:6px;">
|
||
{_hw_name}
|
||
</div>
|
||
</div>
|
||
</div>
|
||
""")
|
||
|
||
# ── Physics formula display
|
||
_formula = mo.Html(f"""
|
||
<div class="lab-card" style="margin:8px 0; background:#f8fafc; font-family:monospace; font-size:0.85rem;">
|
||
<div style="color:#64748b; font-weight:700; font-size:0.72rem; text-transform:uppercase; letter-spacing:0.08em; margin-bottom:8px;">Iron Law Computation — {_hw_name}</div>
|
||
<div style="line-height:2.0; color:#1e293b;">
|
||
<span style="color:#006395;">D/BW</span> = {_resnet_data_gb:.3f} GB ÷ {_hw_bw:,.1f} GB/s = <strong style="color:{_mem_card_color};">{_T_mem_ms:.4f} ms</strong><br>
|
||
<span style="color:#006395;">O/R</span> = {_resnet_ops_gflops:.1f} GFLOPs ÷ {_hw_tflops * 1000:.0f} GFLOPs/s = <strong style="color:{_comp_card_color};">{_T_comp_ms:.4f} ms</strong><br>
|
||
<span style="color:#CC5500;">L</span> = {_overhead_ms:.2f} ms (dispatch overhead)<br>
|
||
<strong>T_total = {_T_total_ms:.4f} ms</strong>
|
||
Bottleneck: <strong style="color:{'#CB202D' if not _oom else '#CB202D'};">
|
||
{'OOM — infeasible' if _oom else _bottleneck.upper() + '-BOUND'}
|
||
</strong>
|
||
</div>
|
||
</div>
|
||
""")
|
||
|
||
mo.vstack([_cards, _chart, _formula])
|
||
return (
|
||
RESNET50_DATA_MB,
|
||
RESNET50_GFLOPS,
|
||
_T_comp_ms,
|
||
_T_mem_ms,
|
||
_T_total_ms,
|
||
_bottleneck,
|
||
_oom,
|
||
_oom_ratio,
|
||
)
|
||
|
||
|
||
# ─── ACT II: OOM FAILURE STATE BANNER ──────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(RESNET50_DATA_MB, MCU_SRAM_MB, _oom, _oom_ratio, mo):
|
||
if _oom:
|
||
_oom_banner = mo.callout(mo.md(
|
||
f"**OOM — Infeasible.** "
|
||
f"ResNet-50 requires {RESNET50_DATA_MB:.0f} MB of RAM for weights and activations. "
|
||
f"The Cortex-M7 has {MCU_SRAM_MB * 1000:.0f} KB of SRAM. "
|
||
f"The model exceeds available memory by **{_oom_ratio:.0f}×**. "
|
||
f"This is not a performance bottleneck — the model cannot be loaded at all. "
|
||
f"Switch back to Cloud (H100) to see a feasible execution, or use the toggle "
|
||
f"to continue exploring the hardware boundary."
|
||
), kind="danger")
|
||
else:
|
||
_oom_banner = mo.md("")
|
||
_oom_banner
|
||
return
|
||
|
||
|
||
# ─── ACT II: PREDICTION REVEAL ─────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(
|
||
H100_BW_GBS,
|
||
H100_TFLOPS,
|
||
RESNET50_DATA_MB,
|
||
RESNET50_GFLOPS,
|
||
_T_comp_ms,
|
||
_T_mem_ms,
|
||
act2_pred,
|
||
mo,
|
||
):
|
||
# Reveal is always computed for H100 context (the prediction question specifies H100)
|
||
_actual_bw = H100_BW_GBS # GB/s
|
||
_actual_tflops = H100_TFLOPS # TFLOPs
|
||
_data_gb = RESNET50_DATA_MB / 1000.0
|
||
_ops_gflops = RESNET50_GFLOPS
|
||
|
||
_h100_mem_ms = (_data_gb / _actual_bw) * 1000.0
|
||
_h100_comp_ms = (_ops_gflops / (_actual_tflops * 1000.0)) * 1000.0
|
||
_h100_dominant = "memory" if _h100_mem_ms > _h100_comp_ms else "compute"
|
||
|
||
_correct_act2 = act2_pred.value == "B"
|
||
|
||
if _correct_act2:
|
||
_reveal2 = mo.callout(mo.md(
|
||
f"**Correct.** On the H100 at batch=1, the Iron Law is **memory-bound**. "
|
||
f"Memory term (D/BW) = {_h100_mem_ms:.4f} ms vs. Compute term (O/R) = {_h100_comp_ms:.5f} ms. "
|
||
f"The H100's 989 TFLOPs of compute finishes the 4 GFLOPs in {_h100_comp_ms*1000:.2f} microseconds, "
|
||
f"but loading 100 MB through 3350 GB/s takes {_h100_mem_ms*1000:.0f} microseconds. "
|
||
f"The data movement is ~{_h100_mem_ms/_h100_comp_ms:.0f}× slower than the arithmetic. "
|
||
f"This is the **Memory Wall** — it persists even on the fastest accelerators at small batch sizes."
|
||
), kind="success")
|
||
elif act2_pred.value == "A":
|
||
_reveal2 = mo.callout(mo.md(
|
||
f"**Incorrect — the opposite is true.** "
|
||
f"The H100 is so fast at arithmetic (989 TFLOPs) that 4 GFLOPs takes only "
|
||
f"{_h100_comp_ms*1000:.2f} microseconds. But 100 MB through 3350 GB/s takes "
|
||
f"{_h100_mem_ms*1000:.0f} microseconds. The memory term is "
|
||
f"~{_h100_mem_ms/_h100_comp_ms:.0f}× larger. "
|
||
f"At batch=1, almost every modern inference workload is memory-bound on cloud hardware."
|
||
), kind="warn")
|
||
elif act2_pred.value == "C":
|
||
_reveal2 = mo.callout(mo.md(
|
||
f"**Not quite.** The two terms are not equal — they differ by "
|
||
f"~{_h100_mem_ms/_h100_comp_ms:.0f}×. "
|
||
f"Memory = {_h100_mem_ms:.4f} ms, Compute = {_h100_comp_ms:.5f} ms. "
|
||
f"The H100's extreme compute throughput makes the arithmetic trivially fast "
|
||
f"at batch=1, while data movement time is determined by the memory bandwidth "
|
||
f"ceiling, which cannot be exceeded."
|
||
), kind="warn")
|
||
else:
|
||
_reveal2 = mo.callout(mo.md(
|
||
f"**Not quite.** The prediction question specifies H100 at batch=1, so the "
|
||
f"answer is deterministic for that context. On the H100, the memory term "
|
||
f"({_h100_mem_ms:.4f} ms) dominates the compute term ({_h100_comp_ms:.5f} ms) "
|
||
f"by ~{_h100_mem_ms/_h100_comp_ms:.0f}×. "
|
||
f"On the Cortex-M7, the memory term also dominates — but both terms are much "
|
||
f"larger, and the model is infeasible anyway (OOM). "
|
||
f"The binding constraint at batch=1 is always the Memory Wall."
|
||
), kind="warn")
|
||
|
||
_reveal2
|
||
return
|
||
|
||
|
||
# ─── ACT II: MATH PEEK ─────────────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.accordion({
|
||
"View the Iron Law — T = D/BW + O/R + L": mo.md("""
|
||
**The Iron Law of ML Systems** (@sec-introduction-iron-law-ml-systems-c32a):
|
||
|
||
```
|
||
T_total = D_vol/BW + O/(R_peak × η) + L_lat
|
||
──────── ───────────────── ─────
|
||
Data Term Compute Term Overhead
|
||
```
|
||
|
||
**Variable definitions:**
|
||
|
||
| Symbol | Meaning | ResNet-50 Value | H100 Value |
|
||
|--------|---------|----------------|------------|
|
||
| `D_vol` | Data volume moved (weights + activations) | 100 MB = 0.1 GB | — |
|
||
| `BW` | Memory bandwidth | — | 3,350 GB/s |
|
||
| `O` | Arithmetic operations | 4 GFLOPs | — |
|
||
| `R_peak` | Peak compute throughput | — | 989 TFLOPs = 989,000 GFLOPs/s |
|
||
| `η` | Hardware utilization (assumed 1.0 here) | — | 1.0 |
|
||
| `L_lat` | Dispatch / overhead latency | — | 0.01 ms |
|
||
|
||
**The systems conclusion (from @sec-introduction):**
|
||
At batch=1, the compute term becomes negligible on high-throughput accelerators.
|
||
The H100's 989 TFLOPs finishes 4 GFLOPs in ~4 nanoseconds. The memory wall
|
||
(loading 100 MB at 3350 GB/s) takes ~30 microseconds — 7,500× longer.
|
||
This is why inference serving systems use **batching**: grouping requests
|
||
increases `O` without proportionally increasing `D_vol`, moving the workload
|
||
from the memory-bound regime toward the compute-bound regime.
|
||
""")
|
||
})
|
||
return
|
||
|
||
|
||
# ─── ACT II: STRUCTURED REFLECTION ────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.md("### Act II Reflection")
|
||
return
|
||
|
||
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
act2_reflect = mo.ui.radio(
|
||
options={
|
||
"A) Too slow — the TFLOPS count is too low": "A",
|
||
"B) Memory capacity — the model exceeds available SRAM": "B",
|
||
"C) Power — the microcontroller overheats": "C",
|
||
"D) Accuracy — quantization degrades the model": "D",
|
||
},
|
||
label="What is the PRIMARY constraint preventing ResNet-50 from running on a Cortex-M7 microcontroller?",
|
||
)
|
||
act2_reflect
|
||
return (act2_reflect,)
|
||
|
||
|
||
@app.cell(hide_code=True)
|
||
def _(MCU_SRAM_MB, RESNET50_DATA_MB, act2_reflect, mo):
|
||
mo.stop(
|
||
act2_reflect.value is None,
|
||
mo.md(""),
|
||
)
|
||
|
||
if act2_reflect.value == "B":
|
||
_fb2 = mo.callout(mo.md(
|
||
f"**Correct.** ResNet-50 requires {RESNET50_DATA_MB:.0f} MB of RAM. "
|
||
f"The Cortex-M7 has {MCU_SRAM_MB * 1000:.0f} KB of SRAM — a {RESNET50_DATA_MB / MCU_SRAM_MB:.0f}× "
|
||
f"deficit. This is not a question of speed: the model literally cannot be loaded. "
|
||
f"Memory capacity is the absolute constraint. This is why TinyML requires "
|
||
f"completely different model architectures (MobileNet, EfficientNet-Lite, "
|
||
f"quantized INT8 networks) that fit within 100–200 KB, not 100 MB."
|
||
), kind="success")
|
||
elif act2_reflect.value == "A":
|
||
_fb2 = mo.callout(mo.md(
|
||
f"**True, but not the primary constraint.** Yes, the Cortex-M7's 0.001 TFLOPs "
|
||
f"would make ResNet-50 extremely slow (~4 seconds per inference). But it would "
|
||
f"never reach that calculation, because it cannot load the model into RAM first. "
|
||
f"Memory capacity failure precedes any speed analysis."
|
||
), kind="warn")
|
||
elif act2_reflect.value == "C":
|
||
_fb2 = mo.callout(mo.md(
|
||
"**Not the primary constraint.** At 0.1 W, the Cortex-M7 operates well within "
|
||
"its thermal envelope — microcontrollers are designed for sustained embedded "
|
||
"operation. Power is not the binding constraint here. Memory capacity is."
|
||
), kind="warn")
|
||
else:
|
||
_fb2 = mo.callout(mo.md(
|
||
"**Incorrect.** Quantization degrades accuracy but solves the memory problem — "
|
||
"a quantized INT8 ResNet-50 still requires ~25 MB, still exceeding 512 KB "
|
||
"by 50×. Accuracy tradeoff is downstream of feasibility. The model must first "
|
||
"fit in memory before any inference can occur."
|
||
), kind="warn")
|
||
|
||
_fb2
|
||
return
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════════════
|
||
# CONNECTIONS TO ECOSYSTEM
|
||
# ═══════════════════════════════════════════════════════════════════════════════
|
||
|
||
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.md("""
|
||
---
|
||
## Connections
|
||
""")
|
||
return
|
||
|
||
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.callout(mo.md("""
|
||
**Textbook** — This lab explores the core quantitative claims of @sec-introduction:
|
||
|
||
- **D·A·M Taxonomy** (@sec-introduction-scaling-regimes): Data, Algorithm, Machine as interdependent axes.
|
||
Optimizing one shifts bottlenecks; it does not eliminate them.
|
||
- **Iron Law** (@sec-introduction-iron-law-ml-systems-c32a): `T = D/BW + O/R + L`.
|
||
The dominant term determines the optimization target.
|
||
- **Deployment Spectrum** (@sec-introduction-deployment-spectrum-a38c): Four paradigms
|
||
(Cloud, Edge, Mobile, TinyML) each governed by distinct physical constraints.
|
||
|
||
**Next Lab** — Lab 02 (The Memory Wall) applies the Iron Law across the full deployment spectrum
|
||
with the Latency Waterfall instrument, introduced for the first time in that lab.
|
||
|
||
**TinyTorch** — In Module 01, you will implement a minimal forward-pass engine and
|
||
observe the Iron Law's three terms directly in profiling output.
|
||
See `tinytorch/src/01_foundations/`.
|
||
"""), kind="info")
|
||
return
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════════════
|
||
# KEY TAKEAWAYS
|
||
# ═══════════════════════════════════════════════════════════════════════════════
|
||
|
||
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.md("""
|
||
---
|
||
## Key Takeaways
|
||
""")
|
||
return
|
||
|
||
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.callout(mo.md("""
|
||
**1. The 10⁶ compute gap is an architectural boundary, not an optimization problem.**
|
||
The H100 delivers 989 TFLOPs; the Cortex-M7 delivers 0.001 TFLOPs — a 989,000× gap.
|
||
No compiler flag bridges this. The gap forces separate model architectures,
|
||
separate compilation targets, and separate runtime stacks. This is the physical
|
||
justification for the four-paradigm deployment spectrum in @sec-ml-systems.
|
||
|
||
**2. At batch=1, cloud accelerators are memory-bound — not compute-bound.**
|
||
The H100 finishes 4 GFLOPs of ResNet-50 arithmetic in ~4 nanoseconds, but moving
|
||
100 MB through HBM3 takes ~30 microseconds. The Memory Wall persists even on the
|
||
fastest hardware when batch size is 1. This is why production serving systems use
|
||
continuous batching: to make the compute term non-negligible and amortize data movement
|
||
across multiple requests.
|
||
"""), kind="success")
|
||
return
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════════════
|
||
# DESIGN LEDGER SAVE + HUD
|
||
# ═══════════════════════════════════════════════════════════════════════════════
|
||
|
||
|
||
@app.cell(hide_code=True)
|
||
def _(
|
||
COLORS,
|
||
_T_comp_ms,
|
||
_T_mem_ms,
|
||
_bottleneck,
|
||
_oom,
|
||
act1_pred,
|
||
act2_context,
|
||
act2_pred,
|
||
ledger,
|
||
mo,
|
||
):
|
||
# ── Save chapter 1 results to Design Ledger
|
||
_act1_correct = act1_pred.value == "C"
|
||
_act2_correct = act2_pred.value == "B"
|
||
|
||
ledger.save(
|
||
chapter=1,
|
||
design={
|
||
"context": act2_context.value,
|
||
"act1_prediction": act1_pred.value,
|
||
"act1_correct": _act1_correct,
|
||
"act2_bottleneck": _bottleneck,
|
||
"act2_prediction": act2_pred.value,
|
||
"act2_correct": _act2_correct,
|
||
"constraint_hit": bool(_oom),
|
||
"oom_triggered": bool(_oom),
|
||
},
|
||
)
|
||
|
||
# ── HUD footer
|
||
_act1_status = "correct" if _act1_correct else "incorrect"
|
||
_act2_status = "correct" if _act2_correct else "incorrect"
|
||
_ctx_display = act2_context.value.upper()
|
||
_oom_display = "YES" if _oom else "NO"
|
||
_bn_display = _bottleneck.upper() if not _oom else "OOM"
|
||
|
||
_hud_color_act1 = COLORS["GreenLine"] if _act1_correct else COLORS["RedLine"]
|
||
_hud_color_act2 = COLORS["GreenLine"] if _act2_correct else COLORS["RedLine"]
|
||
_hud_color_oom = COLORS["RedLine"] if _oom else COLORS["GreenLine"]
|
||
|
||
mo.Html(f"""
|
||
<div class="lab-hud">
|
||
<div>
|
||
<span class="hud-label">LAB </span>
|
||
<span class="hud-value">01 · Magnitude Awakening</span>
|
||
</div>
|
||
<div>
|
||
<span class="hud-label">CONTEXT </span>
|
||
<span class="hud-value">{_ctx_display}</span>
|
||
</div>
|
||
<div>
|
||
<span class="hud-label">ACT I PRED </span>
|
||
<span style="color:{_hud_color_act1}; font-weight:700;">{act1_pred.value} ({_act1_status})</span>
|
||
</div>
|
||
<div>
|
||
<span class="hud-label">ACT II PRED </span>
|
||
<span style="color:{_hud_color_act2}; font-weight:700;">{act2_pred.value} ({_act2_status})</span>
|
||
</div>
|
||
<div>
|
||
<span class="hud-label">BOTTLENECK </span>
|
||
<span class="hud-value">{_bn_display}</span>
|
||
</div>
|
||
<div>
|
||
<span class="hud-label">OOM </span>
|
||
<span style="color:{_hud_color_oom}; font-weight:700;">{_oom_display}</span>
|
||
</div>
|
||
<div>
|
||
<span class="hud-label">LEDGER </span>
|
||
<span class="hud-active">CH01 SAVED</span>
|
||
</div>
|
||
</div>
|
||
""")
|
||
return
|
||
|
||
|
||
if __name__ == "__main__":
|
||
app.run()
|