Files
cs249r_book/labs/vol1/lab_10_model_compress.py
Vijay Janapa Reddi 6f5732558f feat: add complete first-draft labs for both volumes (33 Marimo labs)
Add all Vol1 (labs 01-16) and Vol2 (labs 01-17) interactive Marimo labs
as the first full first-pass implementation of the ML Systems curriculum labs.

Each lab follows the PROTOCOL 2-Act structure (35-40 min):
- Act I: Calibration with prediction lock → instruments → overlay
- Act II: Design challenge with failure states and reflection

Key pedagogical instruments introduced progressively:
- Vol1: D·A·M Triad, Iron Law, Memory Ledger, Roofline, Amdahl's Law,
  Little's Law, P99 Histogram, Compression Frontier, Chouldechova theorem
- Vol2: NVLink vs PCIe cliff, Bisection BW, Young-Daly T*, Parallelism Paradox,
  AllReduce ring vs tree, KV-cache model, Jevons Paradox, DP ε-δ tradeoff,
  SLO composition, Adversarial Pareto, two-volume synthesis capstone

All 35 staged files pass AST syntax verification (36/36 including lab_00).

Also includes:
- labs/LABS_SPEC.md: authoritative sub-agent brief for all lab conventions
- labs/core/style.py: expanded unified design system with semantic color tokens
2026-03-01 19:59:04 -05:00

1650 lines
74 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import marimo
__generated_with = "0.19.6"
app = marimo.App(width="full")
# ─────────────────────────────────────────────────────────────────────────────
# LAB 10: THE COMPRESSION FRONTIER
#
# Chapter: ML Optimizations — Model Compression (@sec-optimizations-model-compression)
# Core Invariant:
# Model compression is NOT free. Every technique moves on a Pareto frontier
# between model quality and resource savings. INT8 quantization gives ~4× memory
# reduction with <1% accuracy drop; INT4 gives ~8× with 2–5% drop; unstructured
# pruning is unpredictable on real hardware because dense kernels ignore zeros.
#
# 2 Contexts: Cloud (H100, 80 GB) vs Mobile (NPU, 8 GB)
#
# Act I — The Quantization Surprise (12–15 min)
# Stakeholder: Mobile App Team Lead
# Prediction: Is INT8 quantization "lossless"?
# Instrument: Quantization impact table — memory, accuracy, latency, energy
# across FP32 / FP16 / INT8 / INT4 / INT2 for selectable model.
# Reveal: Prediction-vs-reality overlay showing actual accuracy drop.
# Reflection: Why does INT8 preserve accuracy better than INT4?
#
# Act II — The Compression Trade-off Frontier (FIRST INTRODUCTION, 20–25 min)
# Stakeholder: Platform Engineering Lead
# Prediction: Best compression strategy for 3 mobile deployment tiers
# Instrument: Compression Trade-off Frontier — Pareto scatter (size vs quality)
# + tier budget dropdowns + metric cards.
# Failure state: OOM danger callout when selection exceeds tier memory budget.
# Reflection: Why is unstructured pruning hardware-inefficient?
#
# Design Ledger: chapter=10, context, compression_method, compression_ratio,
# act1_prediction, act1_correct, act2_result, act2_decision,
# constraint_hit, pareto_optimal
# ─────────────────────────────────────────────────────────────────────────────
# ── CELL 0: SETUP (hide_code=False — leave visible for instructor inspection) ─
@app.cell
def _():
    # Setup cell: imports the UI/plotting libraries, makes the project's
    # `labs.core` package importable, defines the hardware and model reference
    # constants used by the instruments, and returns all of them so that
    # downstream cells can request them by name.
    import marimo as mo
    import sys
    import math
    from pathlib import Path
    import plotly.graph_objects as go
    import numpy as np
    from plotly.subplots import make_subplots
    # Two levels above this file's directory. It is put on sys.path so the
    # `labs.core.*` imports below resolve; this must run before those imports.
    _root = Path(__file__).resolve().parents[2]
    if str(_root) not in sys.path:
        sys.path.insert(0, str(_root))
    from labs.core.state import DesignLedger
    from labs.core.style import COLORS, LAB_CSS, apply_plotly_theme
    # ── Hardware constants (LABS_SPEC.md / NVIDIA and Apple specs) ────────────
    # NOTE(review): NVIDIA's H100 SXM datasheet quotes 1,979 TFLOPS FP16 and
    # 3,958 TOPS INT8 *with structured sparsity*; dense peaks are about half.
    # Confirm which figure the lab intends before reusing these in new math.
    H100_BW_GBS = 3350 # GB/s — H100 SXM5 HBM3 bandwidth
    H100_TFLOPS_FP16 = 1979 # TFLOPS FP16 tensor core peak
    H100_TFLOPS_INT8 = 3958 # TOPS INT8 tensor core (2× FP16)
    H100_RAM_GB = 80 # GB HBM3 capacity
    H100_TDP_W = 700 # Watts TDP
    MOBILE_BW_GBS = 68 # GB/s — Apple A17-class SoC
    MOBILE_TOPS_INT8 = 35 # TOPS INT8 NPU
    MOBILE_RAM_GB = 8 # GB total unified memory
    MOBILE_TDP_W = 5 # Watts sustained (thermal throttle ceiling)
    # ── Bytes per value for each numeric format ────────────────────────────────
    # Source: IEEE 754 / INT quantization: FP32=4B, FP16=2B, INT8=1B, INT4=0.5B, INT2=0.25B
    DTYPE_BYTES = {
        "fp32": 4.0,
        "fp16": 2.0,
        "int8": 1.0,
        "int4": 0.5,
        "int2": 0.25,
    }
    # ── ResNet-50 reference parameters ────────────────────────────────────────
    # Source: @sec-optimizations-model-compression — canonical numbers
    RESNET50_PARAMS_M = 25.6 # million parameters
    RESNET50_FP32_MB = 98.0 # MB in FP32 (25.6M × 4 B = 102.4e6 B ≈ 97.7 MiB, rounded)
    RESNET50_TOP1_ACC = 76.1 # % ImageNet top-1 (torchvision baseline)
    # ── MobileNetV3-Large reference ───────────────────────────────────────────
    MOBILENETV3_PARAMS_M = 5.4 # million parameters
    MOBILENETV3_FP32_MB = 21.1 # MB in FP32
    MOBILENETV3_TOP1_ACC = 75.8 # % ImageNet top-1
    # ── ViT-Base/16 reference ─────────────────────────────────────────────────
    VITBASE_PARAMS_M = 86.0 # million parameters
    VITBASE_FP32_MB = 330.0 # MB in FP32
    VITBASE_TOP1_ACC = 81.1 # % ImageNet top-1
    # ── LLaMA-3 8B reference ──────────────────────────────────────────────────
    LLAMA3_8B_PARAMS_B = 8.0 # billion parameters
    LLAMA3_8B_FP32_GB = 32.0 # GB in FP32 (8B × 4B)
    LLAMA3_8B_PPL = 6.14 # perplexity on WikiText-2 (FP32 baseline)
    # Single ledger instance created here and handed to downstream cells.
    ledger = DesignLedger()
    # The returned names are this cell's public interface: other cells list
    # them as parameters to receive these values.
    return (
        mo, go, np, math, make_subplots,
        ledger, COLORS, LAB_CSS, apply_plotly_theme,
        H100_BW_GBS, H100_TFLOPS_FP16, H100_TFLOPS_INT8, H100_RAM_GB, H100_TDP_W,
        MOBILE_BW_GBS, MOBILE_TOPS_INT8, MOBILE_RAM_GB, MOBILE_TDP_W,
        DTYPE_BYTES,
        RESNET50_PARAMS_M, RESNET50_FP32_MB, RESNET50_TOP1_ACC,
        MOBILENETV3_PARAMS_M, MOBILENETV3_FP32_MB, MOBILENETV3_TOP1_ACC,
        VITBASE_PARAMS_M, VITBASE_FP32_MB, VITBASE_TOP1_ACC,
        LLAMA3_8B_PARAMS_B, LLAMA3_8B_FP32_GB, LLAMA3_8B_PPL,
    )
# ── CELL 1: HEADER ────────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, LAB_CSS, COLORS):
    # Lab title banner: the shared stylesheet followed by a static HTML hero
    # card (title, thesis paragraph, per-act timings, status badges).
    #
    # COLORS stays in the signature so the cell's interface is unchanged, but
    # the banner uses fixed hex colors only. The previous `_mobile_color` and
    # `_cloud_color` locals were assigned and never read, so they are gone.
    # The markup contains no placeholders, so it is a plain string rather
    # than an f-string.
    mo.vstack([
        LAB_CSS,
        mo.Html("""
<div style="background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
padding: 36px 44px; border-radius: 16px; color: white;
box-shadow: 0 8px 32px rgba(0,0,0,0.3);">
<div style="font-size: 0.72rem; font-weight: 700; letter-spacing: 0.18em;
color: #475569; text-transform: uppercase; margin-bottom: 10px;">
Machine Learning Systems &middot; Volume I &middot; Lab 10
</div>
<h1 style="margin: 0 0 10px 0; font-size: 2.4rem; font-weight: 900;
color: #f8fafc; line-height: 1.1; letter-spacing: -0.02em;">
The Compression Frontier
</h1>
<p style="margin: 0 0 20px 0; font-size: 1.05rem; color: #94a3b8;
max-width: 700px; line-height: 1.65;">
Every compression technique trades model quality for resource savings.
INT8 quantization achieves 4&times; size reduction with under 1% accuracy
drop. INT4 reaches 8&times; but costs 2&ndash;5%. Unstructured pruning
often yields no speedup at all. The frontier is real, and you cannot
move along it for free.
</p>
<div style="display: flex; gap: 12px; flex-wrap: wrap;">
<span style="background: rgba(99,102,241,0.15); color: #a5b4fc;
padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
font-weight: 600; border: 1px solid rgba(99,102,241,0.25);">
Act I: The Quantization Surprise &middot; 12&ndash;15 min
</span>
<span style="background: rgba(204,85,0,0.15); color: #fdba74;
padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
font-weight: 600; border: 1px solid rgba(204,85,0,0.25);">
Act II: The Compression Frontier &middot; 20&ndash;25 min
</span>
<span style="background: rgba(16,185,129,0.15); color: #6ee7b7;
padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
font-weight: 600; border: 1px solid rgba(16,185,129,0.25);">
35&ndash;40 min total
</span>
</div>
<div style="display: flex; gap: 12px; flex-wrap: wrap; margin-top: 12px;">
<span class="badge badge-info">First use: Compression Trade-off Frontier</span>
<span class="badge badge-warn">Memory budget failure state active</span>
</div>
</div>
"""),
    ])
    return
# ── CELL 2: RECOMMENDED READING ───────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Pre-lab reading list, rendered as an informational callout.
    _reading_list = mo.md("""
**Recommended Reading** — Complete the following before this lab:
- **@sec-optimizations-quantization** — Uniform quantization: the `Q(x) = round(x/scale + zero_point)` formula, scale calibration, PTQ vs QAT, hardware native support for INT8 vs INT4.
- **@sec-optimizations-pruning** — Unstructured vs structured pruning, the Lottery Ticket Hypothesis, why sparse weights do not automatically yield latency improvements on dense hardware.
- **@sec-optimizations-model-compression** — The accuracy-size Pareto frontier, compression ratio definition, why INT8 is the practical sweet spot for most deployments.
- **@sec-optimizations-knowledge-distillation** — Distillation as an alternative compression axis.
""")
    mo.callout(_reading_list, kind="info")
    return
# ── CELL 3: CONTEXT TOGGLE ────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Deployment-context selector. The radio maps a human-readable label to
    # the short key ("cloud" / "mobile") that downstream cells branch on.
    _CLOUD_LABEL = "Cloud (H100 — 80 GB HBM, 700 W)"
    _MOBILE_LABEL = "Mobile (NPU — 8 GB, 5 W sustained)"
    context_toggle = mo.ui.radio(
        options={_CLOUD_LABEL: "cloud", _MOBILE_LABEL: "mobile"},
        value=_CLOUD_LABEL,  # cloud is the initial selection
        label="Deployment context:",
        inline=True,
    )
    _divider = mo.md("---")
    _prompt = mo.md("**Select your deployment context.** Hardware constraints differ by more than 10× across these two environments.")
    mo.vstack([_divider, _prompt, context_toggle])
    return (context_toggle,)
# ── CELL 4: CONTEXT SPEC CARD ─────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, context_toggle, COLORS):
    # Hardware spec card for the active deployment context. Any value other
    # than "cloud" falls through to the mobile card.
    _CLOUD_SPECS = [
        ("Device", "NVIDIA H100 SXM5"),
        ("HBM Capacity", "80 GB"),
        ("Memory Bandwidth", "3,350 GB/s"),
        ("FP16 Peak", "1,979 TFLOPS"),
        ("INT8 Peak", "3,958 TOPS (2x FP16)"),
        ("Power Budget", "700 W TDP"),
        ("INT8 native support", "Yes — Tensor Cores"),
        ("INT4 native support", "Yes — Tensor Cores"),
    ]
    _MOBILE_SPECS = [
        ("Device", "Mobile NPU (Apple A17-class)"),
        ("RAM Capacity", "8 GB unified"),
        ("Memory Bandwidth", "68 GB/s"),
        ("INT8 Peak", "35 TOPS"),
        ("FP16 throughput", "~0.5x INT8 (software emulation path)"),
        ("Power Budget", "5 W sustained"),
        ("INT8 native support", "Yes — Neural Engine"),
        ("INT4 native support", "Partial — model-dependent"),
    ]
    # Only the accent color of the selected context is looked up.
    if context_toggle.value == "cloud":
        _accent, _bg, _border, _specs = COLORS["Cloud"], "#f0f4ff", "#c7d2fe", _CLOUD_SPECS
    else:
        _accent, _bg, _border, _specs = COLORS["Mobile"], "#fff7ed", "#fed7aa", _MOBILE_SPECS
    # One flex row per (label, value) pair.
    _row_parts = []
    for _name, _value in _specs:
        _row_parts.append(
            f'<div style="display:flex; justify-content:space-between; padding:5px 0; '
            f'border-bottom:1px solid {_border}; font-size:0.85rem;">'
            f'<span style="color:#475569; font-weight:600;">{_name}</span>'
            f'<span style="font-family:monospace; color:{_accent}; font-weight:700;">{_value}</span>'
            f'</div>'
        )
    _rows = "".join(_row_parts)
    mo.Html(f"""
<div style="background:{_bg}; border:1px solid {_border}; border-left:4px solid {_accent};
border-radius:8px; padding:16px 20px; margin: 8px 0;">
<div style="font-size:0.72rem; font-weight:700; color:{_accent}; text-transform:uppercase;
letter-spacing:0.1em; margin-bottom:10px;">
Active Context — Hardware Constraints
</div>
{_rows}
</div>
""")
    return
# ═════════════════════════════════════════════════════════════════════════════
# ACT I — THE QUANTIZATION SURPRISE
# ═════════════════════════════════════════════════════════════════════════════
# ── CELL 5: ACT I SCENARIO ────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, COLORS):
    # Act I opener: section banner, the stakeholder's message, and a short
    # framing paragraph asking the student to commit to a prediction.
    _accent = COLORS["Mobile"]
    _banner = mo.Html("""
<div style="background: linear-gradient(90deg, #0f172a, #1e293b);
padding: 10px 20px; border-radius: 8px; margin: 8px 0;">
<span style="font-size:0.72rem; font-weight:700; color:#6366f1;
text-transform:uppercase; letter-spacing:0.15em;">
Act I &middot; Calibration &middot; 12&ndash;15 min
</span>
<span style="font-size:1.2rem; font-weight:800; color:#f8fafc; margin-left:16px;">
The Quantization Surprise
</span>
</div>
""")
    _message = mo.Html(f"""
<div style="border-left:4px solid {_accent}; background:#fff7ed;
border-radius:0 10px 10px 0; padding:16px 22px; margin:12px 0;">
<div style="font-size:0.72rem; font-weight:700; color:{_accent};
text-transform:uppercase; letter-spacing:0.1em; margin-bottom:6px;">
Incoming Message &middot; Mobile App Team Lead
</div>
<div style="font-style:italic; font-size:1.0rem; color:#1e293b; line-height:1.65;">
"We have a ResNet-50 running at 98 MB in FP32. Our App Store limit
is 25 MB for the on-device model. A colleague told me INT8 quantization
is mathematically lossless — it just changes the number format, so
accuracy is preserved. Is that true? Can we ship INT8 with zero quality
regression?"
</div>
</div>
""")
    _framing = mo.md("""
The team lead has heard that quantization is "lossless." Before you run the
instruments, commit to a prediction. The chapter established
(@sec-optimizations-quantization) that uniform quantization introduces a
rounding error bounded by half the step size. The question is how large that
error is in practice on a real model.
""")
    mo.vstack([mo.md("---"), _banner, _message, _framing])
    return
# ── CELL 6: ACT I PREDICTION LOCK ─────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act I prediction lock: a four-way radio (values "A".."D") with no
    # default selection, shown beneath a "commit first" banner.
    _CHOICES = {
        "A) Yes — INT8 is mathematically equivalent to FP32 for inference": "A",
        "B) Under 0.1% accuracy drop — essentially lossless for practical purposes": "B",
        "C) 0.5 to 2% accuracy drop — practically acceptable but not zero": "C",
        "D) 5 to 10% accuracy drop — unacceptable for production use": "D",
    }
    _QUESTION = (
        "Applying INT8 post-training quantization (PTQ) to ResNet-50 (FP32 baseline: "
        "76.1% ImageNet top-1). What accuracy change do you expect?"
    )
    act1_prediction = mo.ui.radio(options=_CHOICES, label=_QUESTION)
    _lock_banner = mo.Html("""
<div style="background: #1e293b; border-radius: 12px; padding: 20px;
border-left: 4px solid #6366f1; margin: 8px 0;">
<div style="font-size: 0.72rem; font-weight: 700; color: #a5b4fc;
text-transform: uppercase; letter-spacing: 0.1em; margin-bottom: 10px;">
Prediction Lock — Act I
</div>
<div style="color: #e2e8f0; font-size: 0.88rem; margin-bottom: 12px;">
Commit before touching any controls. Your prediction will be
compared to the actual result at the end of this act.
</div>
</div>
""")
    mo.vstack([_lock_banner, act1_prediction])
    return (act1_prediction,)
# ── CELL 7: ACT I GATE ────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction):
    # Act I gate: calls mo.stop with a warning callout while no prediction
    # has been selected.
    _no_prediction_yet = act1_prediction.value is None
    _unlock_hint = mo.callout(
        mo.md("Select your prediction above to unlock the quantization instrument."),
        kind="warn",
    )
    mo.stop(_no_prediction_yet, _unlock_hint)
    return
# ── CELL 8: ACT I CONTROLS ────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act I controls: model picker, quantization scheme picker, and the PTQ
    # calibration-set size slider, followed by a short PTQ-vs-QAT explainer.
    # Returns the three UI elements for the instrument cells to read.
    act1_model = mo.ui.dropdown(
        options={
            "ResNet-50 (25.6 M params, 98 MB FP32)": "resnet50",
            "MobileNetV3-Large (5.4 M params, 21 MB FP32)": "mobilenetv3",
            "ViT-Base/16 (86 M params, 330 MB FP32)": "vitbase",
        },
        value="ResNet-50 (25.6 M params, 98 MB FP32)",
        label="Model architecture",
    )
    act1_quant_scheme = mo.ui.dropdown(
        options={
            "PTQ — Post-Training Quantization (no retraining)": "ptq",
            "QAT — Quantization-Aware Training (fine-tuned)": "qat",
        },
        value="PTQ — Post-Training Quantization (no retraining)",
        label="Quantization scheme",
    )
    act1_calib_size = mo.ui.slider(
        start=128, stop=4096, value=512, step=128,
        label="Calibration dataset size (PTQ only)",
    )
    mo.vstack([
        mo.md("### Quantization Impact Table — Controls"),
        mo.hstack([act1_model, act1_quant_scheme], justify="start", gap="2rem"),
        mo.hstack([act1_calib_size], justify="start"),
        # Fix: the recovery range previously rendered as the garbled
        # "0.20.5%"; the intended range is 0.2–0.5%.
        mo.callout(mo.md(
            "**PTQ vs QAT:** Post-training quantization requires only a calibration "
            "dataset (no gradient computation). QAT fine-tunes with simulated quantization "
            "and recovers 0.2–0.5% additional accuracy, but requires GPU training time."
        ), kind="info"),
    ])
    return (act1_model, act1_quant_scheme, act1_calib_size)
# ── CELL 9: ACT I PHYSICS ENGINE + QUANTIZATION TABLE ─────────────────────────
@app.cell(hide_code=True)
def _(
    mo,
    act1_model, act1_quant_scheme, act1_calib_size,
    context_toggle,
    RESNET50_PARAMS_M, RESNET50_FP32_MB, RESNET50_TOP1_ACC,
    MOBILENETV3_PARAMS_M, MOBILENETV3_FP32_MB, MOBILENETV3_TOP1_ACC,
    VITBASE_PARAMS_M, VITBASE_FP32_MB, VITBASE_TOP1_ACC,
    H100_TDP_W, MOBILE_TDP_W,
    DTYPE_BYTES,
):
    # Act I instrument: builds the Quantization Impact Table. For each numeric
    # format (FP32..INT2) it derives accuracy, accuracy drop, model size,
    # compression ratio, latency and energy per inference from the selected
    # model, quantization scheme, calibration size and deployment context,
    # then renders the rows as an HTML table.
    # ── Model lookup ──────────────────────────────────────────────────────────
    _model_key = act1_model.value
    _scheme = act1_quant_scheme.value
    _calib = act1_calib_size.value
    _ctx = context_toggle.value
    _MODEL_SPECS = {
        "resnet50": {"params_m": RESNET50_PARAMS_M, "fp32_mb": RESNET50_FP32_MB, "base_acc": RESNET50_TOP1_ACC},
        "mobilenetv3": {"params_m": MOBILENETV3_PARAMS_M, "fp32_mb": MOBILENETV3_FP32_MB, "base_acc": MOBILENETV3_TOP1_ACC},
        "vitbase": {"params_m": VITBASE_PARAMS_M, "fp32_mb": VITBASE_FP32_MB, "base_acc": VITBASE_TOP1_ACC},
    }
    _spec = _MODEL_SPECS[_model_key]
    _base_acc = _spec["base_acc"]
    _fp32_mb = _spec["fp32_mb"]
    # ── Calibration quality penalty ───────────────────────────────────────────
    # Source: @sec-optimizations-quantization — larger calibration sets reduce
    # the range estimation error. Applies to PTQ only: fewer than 256 samples
    # adds 0.25 points, fewer than 512 adds 0.10, otherwise nothing.
    _calib_penalty = 0.0
    if _scheme == "ptq":
        if _calib < 256:
            _calib_penalty = 0.25
        elif _calib < 512:
            _calib_penalty = 0.10
    # ── QAT recovery bonus ────────────────────────────────────────────────────
    # Source: @sec-optimizations-quantization — QAT trains with simulated quantization
    # noise, recovering 0.2–0.5% accuracy compared to PTQ at the same bit-width.
    _qat_recovery = 0.35 if _scheme == "qat" else 0.0
    # ── Accuracy drop model ───────────────────────────────────────────────────
    # Source: @sec-optimizations-model-compression empirical figures:
    # FP32 -> FP16: <0.05% (mantissa rounding only; FP16 keeps 10 mantissa bits)
    # FP16 -> INT8: 0.3–0.7% PTQ (linear range mapping loses outlier precision)
    # INT8 -> INT4: 1.5–3.5% PTQ (16 levels instead of 256)
    # INT4 -> INT2: 4–8% PTQ (severe representational collapse)
    # ViT is more sensitive to quantization than ResNets due to attention softmax.
    _SENSITIVITY = {
        "resnet50": 1.0,
        "mobilenetv3": 0.85,
        "vitbase": 1.35,
    }
    _s = _SENSITIVITY[_model_key]
    _ACC_DROP_PTQ = {
        "fp32": 0.00,
        "fp16": 0.04 * _s,
        "int8": 0.50 * _s + _calib_penalty,
        "int4": 2.60 * _s + _calib_penalty * 1.5,
        "int2": 6.80 * _s + _calib_penalty * 2.0,
    }
    # QAT recovery is scaled by each format's drop relative to the INT4 drop
    # (the +0.001 guards the division), and the result is floored at zero.
    _ACC_DROP = {
        k: max(0.0, v - _qat_recovery * (v / (_ACC_DROP_PTQ["int4"] + 0.001)))
        for k, v in _ACC_DROP_PTQ.items()
    }
    # ── Memory size by format ──────────────────────────────────────────────────
    # Source: @sec-optimizations-quantization
    # compression ratio = FP32_bytes / target_bytes
    _MEM_MB = {
        fmt: _fp32_mb * (bpv / 4.0)
        for fmt, bpv in DTYPE_BYTES.items()
    }
    # ── Latency model ──────────────────────────────────────────────────────────
    # Source: @sec-optimizations-model-compression — memory-bandwidth bound inference.
    # Latency ≈ model_size_bytes / memory_bandwidth × 1000 (ms)
    # Cloud: H100 3350 GB/s, natively supports INT8/INT4 at 2× throughput.
    # Mobile: NPU 68 GB/s, FP16 uses software fallback path (~2× slower than INT8).
    # Factors are relative to FP32 on cloud and relative to INT8 on mobile.
    _CLOUD_LATENCY_FACTOR = {
        "fp32": 1.0,
        "fp16": 0.50,
        "int8": 0.25,
        "int4": 0.15,
        "int2": 0.12,
    }
    _MOBILE_LATENCY_FACTOR = {
        "fp32": 4.0,
        "fp16": 1.80,
        "int8": 1.00,
        "int4": 0.65,
        "int2": 0.90,
    }
    # Absolute latency calibration (ms, batch=1)
    _CLOUD_FP32_BASE_MS = 1.2 if _model_key == "resnet50" else (0.3 if _model_key == "mobilenetv3" else 4.8)
    _MOBILE_INT8_BASE_MS = 4.5 if _model_key == "resnet50" else (1.1 if _model_key == "mobilenetv3" else 18.0)
    if _ctx == "cloud":
        _LAT_BASE = _CLOUD_FP32_BASE_MS
        _LAT_FACTOR = _CLOUD_LATENCY_FACTOR
    else:
        _LAT_BASE = _MOBILE_INT8_BASE_MS / _MOBILE_LATENCY_FACTOR["int8"]
        _LAT_FACTOR = _MOBILE_LATENCY_FACTOR
    _POWER_W = H100_TDP_W if _ctx == "cloud" else MOBILE_TDP_W
    # ── Build per-format rows ─────────────────────────────────────────────────
    _formats = ["fp32", "fp16", "int8", "int4", "int2"]
    _fmt_labels = {"fp32": "FP32", "fp16": "FP16", "int8": "INT8", "int4": "INT4", "int2": "INT2"}
    _rows = []
    for _fmt in _formats:
        _drop_val = _ACC_DROP[_fmt]
        _acc_val = _base_acc - _drop_val
        _mem_val = _MEM_MB[_fmt]
        _lat_val = _LAT_BASE * _LAT_FACTOR[_fmt]
        # Energy = power × time. Latency is in milliseconds and power in
        # watts, so the product is already millijoules (1 W × 1 ms = 1 mJ).
        # The previous extra "/ 1000.0" produced joules while the column was
        # labelled mJ, understating every value by 1000×.
        _energy_mj = _lat_val * _POWER_W
        _cr = _fp32_mb / _mem_val
        if _drop_val < 0.15:
            _acc_color = "#008F45"
        elif _drop_val < 1.0:
            _acc_color = "#CC5500"
        else:
            _acc_color = "#CB202D"
        _rows.append({
            "fmt": _fmt,
            "label": _fmt_labels[_fmt],
            "accuracy": _acc_val,
            "drop": _drop_val,
            "acc_color": _acc_color,
            "mem_mb": _mem_val,
            "cr": _cr,
            "lat_ms": _lat_val,
            "energy_mj": _energy_mj,
        })
    # ── HTML table ────────────────────────────────────────────────────────────
    _HEADER_STYLE = (
        "background:#1e293b; color:#94a3b8; font-size:0.72rem; font-weight:700; "
        "text-transform:uppercase; letter-spacing:0.08em; padding:8px 12px; "
        "text-align:right; white-space:nowrap;"
    )
    _CELL_STYLE = "padding:8px 12px; text-align:right; font-size:0.88rem; font-family:monospace;"
    # Collect row fragments and join once instead of repeated string +=.
    _row_fragments = []
    for _r in _rows:
        # The INT8 row gets a green background to mark it as the sweet spot.
        _bg_row = "background:#f0fdf4;" if _r["fmt"] == "int8" else ""
        _drop_color = "#CB202D" if _r["drop"] > 1.5 else ("#CC5500" if _r["drop"] > 0.3 else "#008F45")
        _row_fragments.append(
            f'<tr style="{_bg_row}border-bottom:1px solid #e2e8f0;">'
            f'<td style="padding:8px 12px; font-weight:800; font-size:0.88rem; color:#0f172a;">'
            f'{_r["label"]}</td>'
            f'<td style="{_CELL_STYLE} color:{_r["acc_color"]}; font-weight:700;">'
            f'{_r["accuracy"]:.2f}%</td>'
            f'<td style="{_CELL_STYLE} color:{_drop_color}; font-weight:700;">'
            f'-{_r["drop"]:.2f}%</td>'
            f'<td style="{_CELL_STYLE} color:#006395; font-weight:700;">'
            f'{_r["mem_mb"]:.1f} MB</td>'
            f'<td style="{_CELL_STYLE} color:#475569;">'
            f'{_r["cr"]:.1f}x</td>'
            f'<td style="{_CELL_STYLE} color:#475569;">'
            f'{_r["lat_ms"]:.2f} ms</td>'
            f'<td style="{_CELL_STYLE} color:#475569;">'
            f'{_r["energy_mj"]:.2f} mJ</td>'
            f'</tr>'
        )
    _table_rows_html = "".join(_row_fragments)
    _ctx_label = "Cloud (H100)" if _ctx == "cloud" else "Mobile (NPU)"
    _scheme_label = "PTQ" if _scheme == "ptq" else "QAT"
    # The calibration size is only shown for PTQ.
    _calib_note = f" &middot; Calib: {_calib} samples" if _scheme == "ptq" else ""
    mo.Html(f"""
<div style="margin: 16px 0;">
<div style="font-size:0.72rem; font-weight:700; color:#475569;
text-transform:uppercase; letter-spacing:0.1em; margin-bottom:8px;">
Quantization Impact Table &mdash; {_model_key.upper()}
&middot; {_scheme_label} &middot; {_ctx_label}{_calib_note}
</div>
<div style="overflow-x:auto; border-radius:12px; border:1px solid #e2e8f0;
box-shadow:0 2px 8px rgba(0,0,0,0.04);">
<table style="width:100%; border-collapse:collapse; min-width:600px;">
<thead>
<tr>
<th style="{_HEADER_STYLE} text-align:left;">Format</th>
<th style="{_HEADER_STYLE}">Accuracy (Top-1)</th>
<th style="{_HEADER_STYLE}">Accuracy Drop</th>
<th style="{_HEADER_STYLE}">Model Size</th>
<th style="{_HEADER_STYLE}">Compression</th>
<th style="{_HEADER_STYLE}">Inference Latency</th>
<th style="{_HEADER_STYLE}">Energy / Inference</th>
</tr>
</thead>
<tbody>
{_table_rows_html}
</tbody>
</table>
</div>
<div style="margin-top:8px; font-size:0.78rem; color:#94a3b8; line-height:1.5;">
INT8 highlighted — practical sweet spot: 4x compression,
under 1% accuracy penalty on PTQ with adequate calibration.
</div>
</div>
""")
    return
# ── CELL 10: ACT I ACCURACY/SIZE CHART ────────────────────────────────────────
@app.cell(hide_code=True)
def _(
    mo, go, make_subplots, COLORS,
    act1_model, act1_quant_scheme, act1_calib_size,
    RESNET50_FP32_MB, RESNET50_TOP1_ACC,
    MOBILENETV3_FP32_MB, MOBILENETV3_TOP1_ACC,
    VITBASE_FP32_MB, VITBASE_TOP1_ACC,
    DTYPE_BYTES,
):
    # Act I chart: two side-by-side bar plots (top-1 accuracy and model size
    # per numeric format) for the selected model and quantization settings.
    # The accuracy-drop arithmetic is repeated here rather than imported from
    # the table cell.
    _model_key = act1_model.value
    _scheme = act1_quant_scheme.value
    _calib = act1_calib_size.value
    _MODEL_LOOKUP = {
        "resnet50": {"fp32_mb": RESNET50_FP32_MB, "base_acc": RESNET50_TOP1_ACC},
        "mobilenetv3": {"fp32_mb": MOBILENETV3_FP32_MB, "base_acc": MOBILENETV3_TOP1_ACC},
        "vitbase": {"fp32_mb": VITBASE_FP32_MB, "base_acc": VITBASE_TOP1_ACC},
    }
    _chosen = _MODEL_LOOKUP[_model_key]
    _base_acc = _chosen["base_acc"]
    _fp32_mb = _chosen["fp32_mb"]
    # PTQ-only penalty for small calibration sets.
    _calib_penalty = 0.0
    if _scheme == "ptq":
        if _calib < 256:
            _calib_penalty = 0.25
        elif _calib < 512:
            _calib_penalty = 0.10
    _qat_recovery = 0.0
    if _scheme == "qat":
        _qat_recovery = 0.35
    _sens = {"resnet50": 1.0, "mobilenetv3": 0.85, "vitbase": 1.35}[_model_key]
    _ptq_drop = {
        "fp32": 0.00,
        "fp16": 0.04 * _sens,
        "int8": 0.50 * _sens + _calib_penalty,
        "int4": 2.60 * _sens + _calib_penalty * 1.5,
        "int2": 6.80 * _sens + _calib_penalty * 2.0,
    }
    # QAT recovery is scaled relative to the INT4 drop and floored at zero.
    _int4_ref = _ptq_drop["int4"] + 0.001
    _drop = {}
    for _name, _raw in _ptq_drop.items():
        _drop[_name] = max(0.0, _raw - _qat_recovery * (_raw / _int4_ref))
    _formats = ["fp32", "fp16", "int8", "int4", "int2"]
    _fmt_labels = ["FP32", "FP16", "INT8", "INT4", "INT2"]
    _acc_vals = []
    _mem_mb = []
    for _f in _formats:
        _acc_vals.append(_base_acc - _drop[_f])
        _mem_mb.append(_fp32_mb * (DTYPE_BYTES[_f] / 4.0))

    def _severity_color(_d):
        # Blue below 0.15 points of drop, orange below 1.0, red otherwise.
        if _d < 0.15:
            return COLORS["BlueLine"]
        if _d < 1.0:
            return COLORS["OrangeLine"]
        return COLORS["RedLine"]

    _bar_colors = [_severity_color(_drop[_f]) for _f in _formats]
    _mono = dict(size=11, family="SF Mono, monospace")
    _fig_a1 = make_subplots(
        rows=1, cols=2,
        subplot_titles=("Accuracy by Format (Top-1 %)", "Model Size by Format (MB)"),
        horizontal_spacing=0.12,
    )
    # Left panel: accuracy bars plus the FP32 baseline reference line.
    _accuracy_bars = go.Bar(
        name="Accuracy", x=_fmt_labels, y=_acc_vals,
        marker_color=_bar_colors,
        text=[f"{_v:.2f}%" for _v in _acc_vals],
        textposition="outside",
        textfont=dict(_mono),
    )
    _fig_a1.add_trace(_accuracy_bars, row=1, col=1)
    _fig_a1.add_hline(
        y=_base_acc, row=1, col=1,
        line_color=COLORS["GreenLine"], line_dash="dash", line_width=1.5,
        annotation_text="FP32 baseline",
        annotation_font_color=COLORS["GreenLine"],
        annotation_position="right",
    )
    # Right panel: size bars plus the 25 MB target from the scenario.
    _size_bars = go.Bar(
        name="Size (MB)", x=_fmt_labels, y=_mem_mb,
        marker_color=[COLORS["BlueLine"]] * len(_formats),
        text=[f"{_v:.1f} MB" for _v in _mem_mb],
        textposition="outside",
        textfont=dict(_mono),
        showlegend=False,
    )
    _fig_a1.add_trace(_size_bars, row=1, col=2)
    _fig_a1.add_hline(
        y=25, row=1, col=2,
        line_color=COLORS["OrangeLine"], line_dash="dot", line_width=2,
        annotation_text="25 MB App Store target",
        annotation_font_color=COLORS["OrangeLine"],
        annotation_position="right",
    )
    _fig_a1.update_layout(
        height=380, showlegend=False,
        plot_bgcolor="white", paper_bgcolor="white",
        font_family="Inter, sans-serif",
        margin=dict(l=40, r=140, t=50, b=40),
    )
    for _col in (1, 2):
        _fig_a1.update_yaxes(gridcolor="#f1f5f9", row=1, col=_col)
    _fig_a1.update_xaxes(linecolor=COLORS["Border"])
    mo.vstack([
        mo.md("### Accuracy and Size Trade-off by Format"),
        mo.plotly(_fig_a1),
    ])
    return
# ── CELL 11: ACT I MATHPEEK ───────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # "MathPeek" accordion: a single collapsible section holding the uniform
    # quantization formula, its error bound, and the compression-ratio math.
    _SECTION_TITLE = "The governing equation — Uniform Quantization"
    _equations = mo.md("""
**Uniform Quantization Formula** (@sec-optimizations-quantization):
```
Q(x) = round(x / scale + zero_point)
```
**Scale calculation (symmetric, per-tensor):**
```
scale = max(|x|) / (2^(bits-1) - 1)
```
For INT8 (bits=8): `scale = max(|x|) / 127`
**Quantization error bound:**
```
|epsilon_Q| <= scale / 2 = max(|x|) / (2 x (2^(bits-1) - 1))
```
**Why INT8 outperforms INT4:**
- INT8: 256 discrete levels — max relative error = 0.39% of range
- INT4: 16 discrete levels — max relative error = 6.25% of range
- Ratio: 16x more quantization error at INT4 vs INT8
**Compression ratio:**
```
CR = FP32_size / target_size = 4 bytes / target_bytes_per_value
```
INT8: CR = 4/1 = **4x** | INT4: CR = 4/0.5 = **8x** | INT2: CR = 4/0.25 = **16x**
""")
    mo.accordion({_SECTION_TITLE: _equations})
    return
# ── CELL 12: ACT I PREDICTION-VS-REALITY REVEAL ───────────────────────────────
@app.cell(hide_code=True)
def _(
    mo, act1_prediction,
    act1_model, act1_quant_scheme, act1_calib_size,
    RESNET50_TOP1_ACC, MOBILENETV3_TOP1_ACC, VITBASE_TOP1_ACC,
):
    # Act I reveal: compares the student's locked prediction against the
    # modelled INT8 accuracy drop for the current control settings and shows
    # option-specific feedback (success callout when the drop falls inside
    # the predicted band, warning callout otherwise).
    #
    # Changes from the previous version:
    # * The INT8 drop now uses the same QAT-recovery formula as the
    #   Quantization Impact Table (recovery scaled by drop / INT4 drop).
    #   Before, this cell subtracted a flat 0.35 * 0.35, so under QAT the
    #   number quoted here disagreed with the table's INT8 row.
    # * Feedback "B" named ResNet-50 regardless of the selected model.
    # * Feedback named "PTQ" even when QAT was selected.
    # * Ranges that rendered as "0.52%" / "510%" are restored to
    #   "0.5–2%" / "5–10%".
    # * The baseline-accuracy lookup was computed but never used and is
    #   removed; the *_TOP1_ACC parameters stay for interface compatibility.
    _model_key = act1_model.value
    _scheme = act1_quant_scheme.value
    _calib = act1_calib_size.value
    _scheme_label = "PTQ" if _scheme == "ptq" else "QAT"
    # PTQ-only penalty for small calibration sets.
    _calib_penalty = 0.0
    if _scheme == "ptq":
        if _calib < 256:
            _calib_penalty = 0.25
        elif _calib < 512:
            _calib_penalty = 0.10
    _qat_recovery = 0.35 if _scheme == "qat" else 0.0
    _SENS3 = {"resnet50": 1.0, "mobilenetv3": 0.85, "vitbase": 1.35}
    _s3 = _SENS3[_model_key]
    # Same arithmetic as the table's INT8 row: the PTQ drop minus QAT
    # recovery scaled by (INT8 drop / INT4 drop), floored at zero.
    _int8_ptq = 0.50 * _s3 + _calib_penalty
    _int4_ptq = 2.60 * _s3 + _calib_penalty * 1.5
    _int8_drop = max(0.0, _int8_ptq - _qat_recovery * (_int8_ptq / (_int4_ptq + 0.001)))
    _pred_val = act1_prediction.value
    # Inclusive [low, high] accuracy-drop band (percentage points) implied by
    # each prediction option.
    _PRED_BANDS = {
        "A": (0.0, 0.0),
        "B": (0.0, 0.1),
        "C": (0.5, 2.0),
        "D": (5.0, 10.0),
    }
    _lo, _hi = _PRED_BANDS[_pred_val]
    _correct = _lo <= _int8_drop <= _hi
    _FEEDBACK = {
        "A": (
            f"**Not quite.** INT8 is not mathematically equivalent to FP32. "
            f"Quantization maps each floating-point weight to one of 256 discrete integer levels "
            f"using `Q(x) = round(x/scale + zero_point)`. Every rounding is a real error. "
            f"For {_model_key.upper()}, INT8 {_scheme_label} costs **{_int8_drop:.2f}% accuracy** — "
            f"small, but nonzero and measurable."
        ),
        "B": (
            f"**Close, but the data disagrees.** INT8 is not lossless — it introduces "
            f"rounding error bounded by `scale/2`. For {_model_key.upper()}, the INT8 drop is "
            f"**{_int8_drop:.2f}%**, which is above the 0.1% threshold. "
            f"QAT can bring it close to 0.1%, but standard PTQ will not."
        ),
        "C": (
            f"**Correct.** INT8 PTQ introduces a measurable but practically acceptable "
            f"accuracy penalty. For {_model_key.upper()}, the actual drop is "
            f"**{_int8_drop:.2f}%**. This falls squarely in the 0.5–2% range for PTQ with "
            f"adequate calibration. The team lead was wrong that INT8 is lossless, "
            f"but right that it is usable in production."
        ),
        "D": (
            f"**Not quite.** A 5–10% drop would make INT8 unusable, but that level of "
            f"degradation is characteristic of INT2 or very aggressive INT4, not INT8. "
            f"For {_model_key.upper()}, INT8 {_scheme_label} costs only **{_int8_drop:.2f}%** accuracy — "
            f"enough to notice in A/B testing, but not enough to block deployment."
        ),
    }
    mo.callout(
        mo.md(_FEEDBACK[_pred_val]),
        kind="success" if _correct else "warn",
    )
    return
# ── CELL 13: ACT I REFLECTION ─────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act I reflection question: four-way radio (values "A".."D") with no
    # default selection, shown under a divider and heading.
    _ANSWERS = {
        "A) INT8 has more representable values — less rounding error in the linear mapping": "A",
        "B) INT8 is only applied to weights, not activations, so errors cancel": "B",
        "C) INT4 always uses non-uniform quantization which amplifies error": "C",
        "D) INT8 activations are always exactly representable in hardware": "D",
    }
    _PROMPT = "Reflection: Why does INT8 preserve accuracy better than INT4 in uniform quantization?"
    act1_reflection = mo.ui.radio(options=_ANSWERS, label=_PROMPT)
    _divider = mo.md("---")
    _heading = mo.md("### Reflection — Act I")
    mo.vstack([_divider, _heading, act1_reflection])
    return (act1_reflection,)
# ── CELL 14: ACT I REFLECTION FEEDBACK ───────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_reflection):
    # Act I reflection feedback: waits for an answer, then shows the
    # explanation for the chosen option. Only option "A" is marked correct.
    _choice = act1_reflection.value
    _prompt = mo.callout(mo.md("Select an answer to see the explanation."), kind="warn")
    mo.stop(_choice is None, _prompt)
    _RIGHT_ANSWER = "A"
    _EXPLANATIONS = {
        "A": (
            "**Correct.** INT8 has 256 representable levels; INT4 has only 16. "
            "The quantization scale factor `scale = max(|x|) / (2^(bits-1) - 1)` "
            "is 16x larger for INT4 than INT8 when the value range is the same. "
            "Each rounding error is up to 16x larger, and those errors accumulate "
            "through layers. The accuracy gap between INT8 and INT4 is fundamentally "
            "an information-capacity gap."
        ),
        "B": (
            "**Not correct.** Modern quantization (PTQ and QAT) applies to both "
            "weights and activations. Quantizing only weights would reduce memory "
            "footprint but leave inference arithmetic in FP32, missing the full "
            "latency benefit. The accuracy penalty comes from both domains."
        ),
        "C": (
            "**Not correct.** Standard INT4 quantization uses uniform mapping, just "
            "like INT8. Non-uniform quantization (e.g., NF4 used in QLoRA) actually "
            "improves accuracy by placing more bins near zero where values cluster. "
            "The INT4 accuracy penalty is a direct consequence of having 16 levels "
            "vs 256 — not a property of uniform vs non-uniform mapping."
        ),
        "D": (
            "**Not correct.** INT8 activations are not exactly representable — "
            "they are the output of the same rounding that weights undergo. "
            "An activation of 0.732 mapped to scale=0.006 becomes round(0.732/0.006) = 122, "
            "which dequantizes to 0.732 plus-or-minus 0.003. The error exists; it is "
            "bounded by `scale/2`."
        ),
    }
    _explanation = _EXPLANATIONS[_choice]
    if _choice == _RIGHT_ANSWER:
        _kind = "success"
    else:
        _kind = "warn"
    mo.callout(mo.md(_explanation), kind=_kind)
    return
# ═════════════════════════════════════════════════════════════════════════════
# ACT II — THE COMPRESSION TRADE-OFF FRONTIER
# ═════════════════════════════════════════════════════════════════════════════
# ── CELL 15: ACT II SCENARIO ──────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, COLORS):
    # Act II opener: a title banner, the stakeholder brief, one card per
    # deployment tier, and a callout introducing the frontier instrument.
    # Multi-line HTML/Markdown literals are kept flush-left so their text is
    # emitted exactly as written.
    _accent = COLORS["Cloud"]

    _rule = mo.md("---")

    _act_banner = mo.Html(f"""
<div style="background: linear-gradient(90deg, #0f172a, #1e293b);
padding: 10px 20px; border-radius: 8px; margin: 8px 0;">
<span style="font-size:0.72rem; font-weight:700; color:{_accent};
text-transform:uppercase; letter-spacing:0.15em;">
Act II &middot; Design Challenge &middot; 20&ndash;25 min
</span>
<span style="font-size:1.2rem; font-weight:800; color:#f8fafc; margin-left:16px;">
The Compression Trade-off Frontier
</span>
</div>
""")

    _stakeholder_brief = mo.Html(f"""
<div style="border-left:4px solid {_accent}; background:#f0f4ff;
border-radius:0 10px 10px 0; padding:16px 22px; margin:12px 0;">
<div style="font-size:0.72rem; font-weight:700; color:{_accent};
text-transform:uppercase; letter-spacing:0.1em; margin-bottom:6px;">
Incoming Message &middot; Platform Engineering Lead
</div>
<div style="font-style:italic; font-size:1.0rem; color:#1e293b; line-height:1.65;">
"We are deploying LLaMA-3 8B as an on-device model across three mobile
tiers in our user base: Flagship (8 GB RAM), Mid-range (4 GB), and
Budget (2 GB). I need a different compression strategy for each tier.
FP32 is 32 GB — none of them can fit that. Design the compression stack
that keeps each tier as close to FP32 quality as possible within its
memory budget."
</div>
</div>
""")

    _tier_grid = mo.Html("""
<div style="display:grid; grid-template-columns:repeat(3, 1fr); gap:14px; margin:16px 0;">
<div style="background:#f0fdf4; border:1px solid #bbf7d0; border-top:4px solid #008F45;
border-radius:8px; padding:14px;">
<div style="font-weight:800; color:#14532d; font-size:0.9rem; margin-bottom:4px;">
Flagship Tier
</div>
<div style="font-family:monospace; font-size:1.4rem; font-weight:900; color:#008F45;">
8 GB
</div>
<div style="font-size:0.8rem; color:#166534; margin-top:4px;">
Available RAM budget
</div>
</div>
<div style="background:#fff7ed; border:1px solid #fed7aa; border-top:4px solid #CC5500;
border-radius:8px; padding:14px;">
<div style="font-weight:800; color:#9a3412; font-size:0.9rem; margin-bottom:4px;">
Mid-range Tier
</div>
<div style="font-family:monospace; font-size:1.4rem; font-weight:900; color:#CC5500;">
4 GB
</div>
<div style="font-size:0.8rem; color:#7c2d12; margin-top:4px;">
Available RAM budget
</div>
</div>
<div style="background:#fef2f2; border:1px solid #fecaca; border-top:4px solid #CB202D;
border-radius:8px; padding:14px;">
<div style="font-weight:800; color:#991b1b; font-size:0.9rem; margin-bottom:4px;">
Budget Tier
</div>
<div style="font-family:monospace; font-size:1.4rem; font-weight:900; color:#CB202D;">
2 GB
</div>
<div style="font-size:0.8rem; color:#7f1d1d; margin-top:4px;">
Available RAM budget
</div>
</div>
</div>
""")

    _instrument_intro = mo.callout(mo.md("""
**First introduction: Compression Trade-off Frontier** — This instrument
(@sec-optimizations-model-compression) plots every compression configuration
as a point in (model size, quality) space, then highlights the Pareto frontier:
the set of configurations where you cannot improve quality without increasing size,
or reduce size without hurting quality. Your goal is to select the Pareto-optimal
configuration for each deployment tier.
"""), kind="info")

    mo.vstack([
        _rule,
        _act_banner,
        _stakeholder_brief,
        _tier_grid,
        _instrument_intro,
    ])
    return
# ── CELL 16: ACT II PREDICTION LOCK ───────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act II prediction lock: a four-option radio (exported as
    # `act2_prediction`) shown beneath a dark "commit first" panel.
    # Each option's display text maps to a one-letter key.
    _STRATEGY_CHOICES = {
        "A) Use INT8 for all three tiers — INT8 is the universal safe choice": "A",
        "B) Flagship: INT8, Mid-range: INT4, Budget: INT4 + 50% structured pruning": "B",
        "C) 50% unstructured pruning for all tiers — pruning is always better than quantization": "C",
        "D) Distill a separate small model for budget tier — quantization never works below 4 bits": "D",
    }
    _question = (
        "LLaMA-3 8B, FP32 = 32 GB. Design the compression strategy for Flagship (8 GB), "
        "Mid-range (4 GB), and Budget (2 GB) tiers that maximizes quality within each budget."
    )
    act2_prediction = mo.ui.radio(options=_STRATEGY_CHOICES, label=_question)

    _lock_panel = mo.Html("""
<div style="background: #1e293b; border-radius: 12px; padding: 20px;
border-left: 4px solid #6366f1; margin: 8px 0;">
<div style="font-size: 0.72rem; font-weight: 700; color: #a5b4fc;
text-transform: uppercase; letter-spacing: 0.1em; margin-bottom: 10px;">
Prediction Lock — Act II
</div>
<div style="color: #e2e8f0; font-size: 0.88rem; margin-bottom: 12px;">
Commit your strategy prediction before exploring the Frontier.
</div>
</div>
""")

    mo.vstack([_lock_panel, act2_prediction])
    return (act2_prediction,)
# ── CELL 17: ACT II GATE ──────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act2_prediction):
    # Act II gate: while no prediction has been selected, stop this cell and
    # display a warning asking the student to commit first.
    # NOTE(review): this cell exports no names, so no other cell depends on
    # it; presumably the stop only affects this cell's own output — confirm
    # whether the later Act II cells were meant to be blocked too.
    _awaiting_prediction = act2_prediction.value is None
    _unlock_hint = mo.callout(
        mo.md("Select your strategy prediction above to unlock the Compression Frontier."),
        kind="warn",
    )
    mo.stop(_awaiting_prediction, _unlock_hint)
    return
# ── CELL 18: ACT II CONTROLS ──────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act II controls: one strategy dropdown per deployment tier, all sharing
    # the same menu. Exports act2_flagship, act2_midrange and act2_budget.

    # Menu text -> configuration key stored as the dropdown's value.
    _STRATEGY_MENU = {
        "FP32 (32 GB — 1x compression)": "fp32",
        "FP16 (16 GB — 2x compression)": "fp16",
        "INT8 (8 GB — 4x compression)": "int8",
        "INT4 (4 GB — 8x compression)": "int4",
        "INT4 + 10% structured pruning (~3.6 GB)": "int4_prune10",
        "INT4 + 30% structured pruning (~2.8 GB)": "int4_prune30",
        "INT4 + 50% structured pruning (~2.0 GB)": "int4_prune50",
        "INT4 + 50% unstructured pruning (~2.0 GB)": "int4_prune50_unstruct",
        "Distilled 4B model, INT8 (~4 GB)": "distil_4b_int8",
        "Distilled 1B model, INT8 (~1 GB)": "distil_1b_int8",
    }

    # (initially selected menu entry, widget label), in display order.
    _TIER_DEFAULTS = (
        ("INT8 (8 GB — 4x compression)", "Flagship tier (8 GB budget)"),
        ("INT4 (4 GB — 8x compression)", "Mid-range tier (4 GB budget)"),
        ("INT4 + 50% structured pruning (~2.0 GB)", "Budget tier (2 GB budget)"),
    )
    act2_flagship, act2_midrange, act2_budget = [
        mo.ui.dropdown(options=_STRATEGY_MENU, value=_initial, label=_caption)
        for _initial, _caption in _TIER_DEFAULTS
    ]

    _heading = mo.md("### Assign a Compression Strategy to Each Deployment Tier")
    _instructions = mo.md("""
Select a compression configuration for each tier. The Compression Trade-off
Frontier below will update to show where each choice sits relative to the
Pareto-optimal boundary.
""")
    _picker_row = mo.hstack(
        [act2_flagship, act2_midrange, act2_budget],
        justify="start",
        gap="1.5rem",
    )

    mo.vstack([_heading, _instructions, _picker_row])
    return (act2_flagship, act2_midrange, act2_budget)
# ── CELL 19: COMPRESSION FRONTIER PLOT + TIER METRICS ─────────────────────────
@app.cell(hide_code=True)
def _(
    mo, go, COLORS,
    act2_flagship, act2_midrange, act2_budget,
    LLAMA3_8B_PPL,
):
    # Compression Frontier: scatter every compression configuration in
    # (size, perplexity) space, draw each tier's memory budget, star the
    # three current selections, and summarise each tier in a metric card.

    # Underscore alias keeps the import private to this cell.
    import math as _math

    # ── LLaMA-3 8B compression data ───────────────────────────────────────────
    # Source: @sec-optimizations-model-compression — empirical compression curves.
    # Perplexity (lower = better): FP32 baseline = 6.14 (WikiText-2).
    # Calibrated to published llama.cpp / GGUF / bitsandbytes benchmarks.
    #
    # Tuple structure: (label, method_key, size_gb, perplexity, pareto_flag)
    # NOTE(review): pareto_flag is hand-assigned, not derived from the two
    # plotted axes — e.g. the unstructured 50% row has lower perplexity at
    # the same 2.0 GB as the structured row yet is flagged off-frontier.
    # Presumably this encodes hardware efficiency; confirm with the chapter.
    _CONFIGS = [
        ("FP32 (baseline)", "fp32", 32.0, 6.14, True),
        ("FP16", "fp16", 16.0, 6.16, True),
        ("INT8", "int8", 8.0, 6.21, True),
        ("INT4", "int4", 4.0, 6.47, True),
        ("INT4 + 10% struct. pruning", "int4_prune10", 3.6, 6.63, True),
        ("INT4 + 30% struct. pruning", "int4_prune30", 2.8, 7.12, True),
        ("INT4 + 50% struct. pruning", "int4_prune50", 2.0, 8.05, True),
        ("INT4 + 50% unstruct. pruning", "int4_prune50_unstruct", 2.0, 7.85, False),
        ("Distilled 4B, INT8", "distil_4b_int8", 4.1, 7.30, False),
        ("Distilled 1B, INT8", "distil_1b_int8", 1.0, 9.80, False),
    ]
    _CONFIG_MAP = {_cfg[1]: _cfg for _cfg in _CONFIGS}

    _TIER_BUDGETS = {"flagship": 8.0, "midrange": 4.0, "budget": 2.0}
    _TIER_KEYS = {
        "flagship": act2_flagship.value,
        "midrange": act2_midrange.value,
        "budget": act2_budget.value,
    }
    _TIER_COLORS = {
        "flagship": COLORS["GreenLine"],
        "midrange": COLORS["OrangeLine"],
        "budget": COLORS["RedLine"],
    }
    _TIER_LABELS = {"flagship": "Flagship", "midrange": "Mid-range", "budget": "Budget"}
    # (card background, card border) per tier.
    _TIER_TINTS = {
        "flagship": ("#f0fdf4", "#bbf7d0"),
        "midrange": ("#fff7ed", "#fed7aa"),
        "budget": ("#fef2f2", "#fecaca"),
    }

    # Split the table by its pareto_flag; the connecting line needs the
    # flagged points ordered by size.
    _on_frontier = [_cfg for _cfg in _CONFIGS if _cfg[4]]
    _off_frontier = [_cfg for _cfg in _CONFIGS if not _cfg[4]]
    _frontier_path = sorted((_cfg[2], _cfg[3]) for _cfg in _on_frontier)

    _POINT_HOVER = (
        "<b>%{text}</b><br>Size: %{x:.1f} GB<br>Perplexity: %{y:.2f}<extra></extra>"
    )

    _fig2 = go.Figure()
    _fig2.add_trace(go.Scatter(
        x=[_cfg[2] for _cfg in _off_frontier],
        y=[_cfg[3] for _cfg in _off_frontier],
        mode="markers",
        name="Dominated (off-frontier)",
        marker=dict(color="#94a3b8", size=10, symbol="circle-open", line=dict(width=2)),
        text=[_cfg[0] for _cfg in _off_frontier],
        hovertemplate=_POINT_HOVER,
    ))
    _fig2.add_trace(go.Scatter(
        x=[_pt[0] for _pt in _frontier_path],
        y=[_pt[1] for _pt in _frontier_path],
        mode="lines",
        name="Pareto frontier",
        line=dict(color=COLORS["BlueLine"], width=2, dash="dot"),
        showlegend=True,
        hoverinfo="skip",
    ))
    _fig2.add_trace(go.Scatter(
        x=[_cfg[2] for _cfg in _on_frontier],
        y=[_cfg[3] for _cfg in _on_frontier],
        mode="markers",
        name="Pareto-optimal",
        marker=dict(color=COLORS["BlueLine"], size=11, symbol="circle",
                    line=dict(color="white", width=2)),
        text=[_cfg[0] for _cfg in _on_frontier],
        hovertemplate=_POINT_HOVER,
    ))

    # Budget markers. The x-axis is logarithmic: the line shape takes the
    # data value, but an annotation on a log axis must be positioned at
    # log10(value). The label is therefore added explicitly instead of via
    # add_vline's annotation_* shortcut, which reuses the line's x value.
    for _tier, _limit_gb in _TIER_BUDGETS.items():
        _fig2.add_vline(
            x=_limit_gb,
            line_color=_TIER_COLORS[_tier],
            line_width=1.5,
            line_dash="dash",
        )
        _fig2.add_annotation(
            x=_math.log10(_limit_gb),
            xref="x",
            y=1,
            yref="paper",
            text=f"{_TIER_LABELS[_tier]} ({_limit_gb:.0f} GB)",
            font=dict(color=_TIER_COLORS[_tier]),
            showarrow=False,
            xanchor="center",
            yanchor="bottom",
        )

    # One star per tier at the currently selected configuration.
    for _tier, _choice in _TIER_KEYS.items():
        if _choice not in _CONFIG_MAP:
            continue
        _cfg = _CONFIG_MAP[_choice]
        _sel_label, _sel_size, _sel_ppl = _cfg[0], _cfg[2], _cfg[3]
        _fig2.add_trace(go.Scatter(
            x=[_sel_size], y=[_sel_ppl],
            mode="markers+text",
            name=f"{_TIER_LABELS[_tier]} selection",
            marker=dict(color=_TIER_COLORS[_tier], size=18,
                        symbol="star", line=dict(color="white", width=2)),
            text=[_TIER_LABELS[_tier]],
            textposition="top center",
            textfont=dict(size=11, color=_TIER_COLORS[_tier]),
            hovertemplate=(
                f"<b>{_TIER_LABELS[_tier]}: {_sel_label}</b>"
                f"<br>Size: {_sel_size:.1f} GB<br>Perplexity: {_sel_ppl:.2f}"
                f"<extra></extra>"
            ),
        ))

    _fig2.update_layout(
        xaxis=dict(
            title="Model Size (GB)",
            type="log",
            tickvals=[1, 2, 4, 8, 16, 32],
            ticktext=["1 GB", "2 GB", "4 GB", "8 GB", "16 GB", "32 GB"],
            gridcolor="#f1f5f9", linecolor=COLORS["Border"],
            # Log-axis ranges are given as exponents of 10.
            range=[-0.05, 1.55],
        ),
        yaxis=dict(
            title="Perplexity on WikiText-2 (lower = better)",
            gridcolor="#f1f5f9", linecolor=COLORS["Border"],
            range=[5.8, 10.5],
        ),
        height=500,
        plot_bgcolor="white",
        paper_bgcolor="white",
        font_family="Inter, sans-serif",
        font_color=COLORS["Text"],
        margin=dict(l=60, r=40, t=30, b=60),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0),
    )

    # ── Per-tier metric cards ──────────────────────────────────────────────────
    _PARETO_BADGE = (
        '<span style="background:#f0fdf4; border:1px solid #bbf7d0; padding:1px 7px; '
        'border-radius:4px; font-weight:700; color:#008F45; font-size:0.72rem;">Pareto-optimal</span>'
    )
    _OFF_FRONTIER_BADGE = (
        '<span style="background:#fef2f2; border:1px solid #fecaca; padding:1px 7px; '
        'border-radius:4px; font-weight:700; color:#CB202D; font-size:0.72rem;">Off-frontier</span>'
    )

    _cards = []
    for _tier, _choice in _TIER_KEYS.items():
        _limit_gb = _TIER_BUDGETS[_tier]
        _accent = _TIER_COLORS[_tier]
        _card_bg, _card_border = _TIER_TINTS[_tier]

        _cfg = _CONFIG_MAP.get(_choice)
        if _cfg is None:
            # Unrecognised key: neutral placeholder card.
            _ppl, _ppl_delta = 0.0, 0.0
            _fit_text, _fit_color, _ppl_color, _badge = "Unknown", "#94a3b8", "#94a3b8", ""
        else:
            _size_gb, _ppl, _is_pareto = _cfg[2], _cfg[3], _cfg[4]
            _ppl_delta = _ppl - LLAMA3_8B_PPL
            if _ppl_delta < 0.3:
                _ppl_color = "#008F45"
            elif _ppl_delta < 1.5:
                _ppl_color = "#CC5500"
            else:
                _ppl_color = "#CB202D"
            if _size_gb <= _limit_gb:
                _fit_text, _fit_color = f"{_size_gb:.1f} GB (fits)", "#008F45"
            else:
                _fit_text = f"{_size_gb:.1f} GB — EXCEEDS {_limit_gb:.0f} GB"
                _fit_color = "#CB202D"
            _badge = _PARETO_BADGE if _is_pareto else _OFF_FRONTIER_BADGE

        # The "+" format flag supplies the sign, so a negative delta reads
        # "-0.10" instead of "+-0.10".
        _cards.append(
            f'<div style="background:{_card_bg}; border:1px solid {_card_border}; border-top:4px solid {_accent};'
            f'border-radius:8px; padding:14px 16px; flex:1; min-width:180px;">'
            f'<div style="font-weight:800; color:{_accent}; font-size:0.85rem; margin-bottom:8px;">'
            f'{_TIER_LABELS[_tier]} ({_limit_gb:.0f} GB budget)</div>'
            f'<div style="font-size:0.82rem; line-height:1.9;">'
            f'<div><span style="color:#475569; font-weight:600;">Strategy:</span> '
            f'<span style="font-family:monospace; color:#0f172a;">{_choice}</span></div>'
            f'<div><span style="color:#475569; font-weight:600;">Size:</span> '
            f'<span style="font-family:monospace; color:{_fit_color}; font-weight:700;">{_fit_text}</span></div>'
            f'<div><span style="color:#475569; font-weight:600;">Perplexity:</span> '
            f'<span style="font-family:monospace; color:{_ppl_color}; font-weight:700;">'
            f'{_ppl:.2f} ({_ppl_delta:+.2f} vs FP32)</span></div>'
            f'<div style="margin-top:4px;">{_badge}</div>'
            f'</div></div>'
        )
    _tier_cards_html = "".join(_cards)

    mo.vstack([
        mo.md("### Compression Trade-off Frontier — LLaMA-3 8B"),
        mo.md("""
Each point is a compression configuration. **Blue dots** lie on the Pareto
frontier — where you cannot improve quality without increasing size.
**Star markers** show your selections. Dashed vertical lines mark each
tier's memory budget.
"""),
        mo.plotly(_fig2),
        mo.md("#### Per-Tier Metric Summary"),
        mo.Html(f'<div style="display:flex; gap:14px; flex-wrap:wrap; margin:12px 0;">{_tier_cards_html}</div>'),
    ])
    return
# ── CELL 20: FAILURE STATE (OOM DETECTION) ────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act2_flagship, act2_midrange, act2_budget):
    # Failure state: compare each tier's selected configuration size with the
    # tier's memory budget. Shows one "danger" callout per tier that
    # overflows, or a single "success" callout when every tier fits.
    _SIZE_MAP_OOM = {
        "fp32": 32.0, "fp16": 16.0, "int8": 8.0, "int4": 4.0,
        "int4_prune10": 3.6, "int4_prune30": 2.8, "int4_prune50": 2.0,
        "int4_prune50_unstruct": 2.0, "distil_4b_int8": 4.1, "distil_1b_int8": 1.0,
    }
    _BUDGETS_OOM = {"flagship": 8.0, "midrange": 4.0, "budget": 2.0}
    _TIER_LABELS_OOM = {"flagship": "Flagship", "midrange": "Mid-range", "budget": "Budget"}
    _TIERS_OOM = {
        "flagship": act2_flagship.value,
        "midrange": act2_midrange.value,
        "budget": act2_budget.value,
    }

    _widgets_oom = []
    for _tier_oom, _key_oom in _TIERS_OOM.items():
        # An unrecognised key counts as 0 GB and therefore never flags.
        _req_oom = _SIZE_MAP_OOM.get(_key_oom, 0.0)
        _avail_oom = _BUDGETS_OOM[_tier_oom]
        if _req_oom <= _avail_oom:
            continue
        _widgets_oom.append(
            mo.callout(
                mo.md(
                    f"**OOM — Infeasible for {_TIER_LABELS_OOM[_tier_oom]} tier.** "
                    f"Required: **{_req_oom:.1f} GB** | Available: **{_avail_oom:.0f} GB** | "
                    f"Overflow: **{_req_oom - _avail_oom:.1f} GB over budget.** "
                    "Select a more aggressive compression scheme for this tier."
                ),
                kind="danger",
            )
        )

    # marimo renders only the cell's final top-level expression. Expressions
    # nested inside an if/else statement are evaluated and discarded, so the
    # widget is chosen first and then named as the last expression.
    if _widgets_oom:
        _oom_report = mo.vstack(_widgets_oom)
    else:
        _oom_report = mo.callout(
            mo.md(
                "**All tiers within budget.** "
                "Every selected configuration fits within its deployment tier memory limit."
            ),
            kind="success",
        )
    _oom_report
    return
# ── CELL 21: ACT II PREDICTION FEEDBACK ───────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act2_prediction):
    # Act II prediction feedback: explain why the committed strategy is or is
    # not the intended tiered allocation.
    _pred2 = act2_prediction.value

    # The radio starts with no selection; without this guard the lookup
    # below would raise KeyError(None) on first render.
    mo.stop(
        _pred2 is None,
        mo.callout(
            mo.md("Select your strategy prediction above to see the feedback."),
            kind="warn",
        ),
    )

    # Option key -> (explanation markdown, whether the option is correct).
    _FEEDBACK_A2 = {
        "A": (
            "**Not quite.** INT8 gives an 8 GB footprint for LLaMA-3 8B — exactly the "
            "Flagship budget. But the Mid-range tier has only 4 GB and the Budget tier "
            "only 2 GB. A single INT8 strategy violates both smaller tiers. You need a "
            "tiered approach that applies progressively stronger compression as the budget "
            "shrinks.",
            False,
        ),
        "B": (
            "**Correct.** This is the Pareto-optimal tiered allocation. "
            "Flagship at INT8 uses the full 8 GB budget with minimal accuracy loss. "
            "Mid-range at INT4 hits exactly the 4 GB constraint. "
            "Budget at INT4 + 50% structured pruning reaches the 2 GB ceiling while "
            "remaining on the Pareto frontier — structured pruning removes entire "
            "attention heads and MLP blocks, so the compressed model still runs "
            "efficiently on dense hardware.",
            True,
        ),
        "C": (
            "**Not correct.** Unstructured pruning sets individual weights to zero but "
            "leaves the tensor dimensions unchanged. Dense matrix kernels on NPUs execute "
            "the same number of MAC operations regardless of how many are zero — the "
            "hardware does not skip zeros. The result is a compressed file but not a "
            "faster computation. Structured pruning removes entire rows/columns, which "
            "genuinely reduces the arithmetic and fits the model into a smaller budget.",
            False,
        ),
        "D": (
            "**Not correct.** Knowledge distillation is a valid axis, but it is not required "
            "here. INT4 + structured pruning reaches 2 GB for LLaMA-3 8B while remaining on "
            "the Pareto frontier. Distilling a separate 1B model produces a fundamentally "
            "different model with different capabilities — appropriate only when quality "
            "degradation from pruning is unacceptable.",
            False,
        ),
    }
    _text2, _correct2 = _FEEDBACK_A2[_pred2]
    mo.callout(mo.md(_text2), kind="success" if _correct2 else "warn")
    return
# ── CELL 22: ACT II MATHPEEK ──────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act II MathPeek: a single-panel accordion holding the reference
    # equations. The Markdown literal is kept flush-left so its fenced
    # blocks are emitted exactly as written.
    _PANEL_TITLE = "The governing equations — Compression Ratio and Pareto Frontier"
    _equations = mo.md("""
**Compression ratio** (@sec-optimizations-model-compression):
```
CR = Size_FP32 / Size_compressed
= (N_params x 4 bytes) / (N_params x bytes_per_value x (1 - sparsity))
```
For INT8 + 0% pruning: CR = 4 / 1.0 = **4x**
For INT4 + 50% structured pruning: CR = 4 / (0.5 x 0.5) = **16x**
**Quantization error bound (uniform per-tensor):**
```
|epsilon_Q| <= max(|x|) / (2 x (2^bits - 1))
```
INT8: max_err = range / 510 — approximately 0.2% of range
INT4: max_err = range / 30 — approximately 3.3% of range
**Structured vs unstructured pruning:**
```
Structured: removes complete rows/columns -> dense submatrix -> hardware efficient
Unstructured: zeros individual elements -> sparse matrix -> dense kernel unchanged
```
Structured pruning at sparsity `s` reduces MACs by exactly `s`:
```
MACs_pruned = MACs_dense x (1 - sparsity)
```
Unstructured pruning at sparsity `s` reduces latency ONLY when specialized
sparse kernels are available (e.g., NVIDIA A100 2:4 sparsity). Without hardware
support, latency is **unchanged** even at 90% sparsity.
**Empirical accuracy-size tradeoff law:**
```
delta_perplexity ≈ alpha x log2(CR)
```
where alpha ≈ 0.12 for LLaMA-class models at moderate compression ratios.
This log relationship explains why compression becomes increasingly costly
as you push toward extreme ratios (INT2, very high sparsity).
""")
    mo.accordion({_PANEL_TITLE: _equations})
    return
# ── CELL 23: ACT II REFLECTION ────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act II reflection question: a four-option radio exported as
    # `act2_reflection`, shown under a divider and a section heading.
    _ANSWER_CHOICES = {
        "A) Pruning removes too many weights, making the model too small to function": "A",
        "B) Sparse operations require special hardware support — dense kernels execute zero weights unchanged": "B",
        "C) Pruned models cannot subsequently be quantized": "C",
        "D) Unstructured pruning always hurts accuracy more than quantization": "D",
    }
    act2_reflection = mo.ui.radio(
        label="Reflection: Why is unstructured pruning often hardware-inefficient in practice?",
        options=_ANSWER_CHOICES,
    )

    _divider = mo.md("---")
    _heading = mo.md("### Reflection — Act II")
    mo.vstack([_divider, _heading, act2_reflection])
    return (act2_reflection,)
# ── CELL 24: ACT II REFLECTION FEEDBACK ──────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act2_reflection):
    # Act II reflection feedback: once an option is picked, show its
    # explanation in a callout styled by whether the option is correct.

    # Until something is selected, halt the cell and show a prompt instead.
    mo.stop(
        act2_reflection.value is None,
        mo.callout(mo.md("Select an answer to see the explanation."), kind="warn"),
    )

    # Option key -> (explanation markdown, whether the option is correct).
    _REFL2 = {
        "A": (
            "**Not correct.** Pruning does not make a model too small to function — "
            "it creates sparse weights within the original tensor dimensions. The model "
            "architecture is unchanged; only individual weight values are forced to zero. "
            "The hardware-efficiency problem is not about model capacity; it is about "
            "whether the underlying arithmetic can exploit those zeros.",
            False,
        ),
        "B": (
            "**Correct.** Dense matrix multiply kernels (GEMM) on GPUs and NPUs are "
            "designed for dense inputs. A kernel computing `C = A x B` iterates over "
            "every element of A — including the zeros introduced by unstructured pruning. "
            "The operation count is identical to the unpruned case. Memory bandwidth "
            "savings require the weights to be stored sparsely (e.g., CSR format), but "
            "even that requires a sparse GEMM kernel. NVIDIA A100 supports 2:4 structured "
            "sparsity natively; arbitrary unstructured sparsity on mobile NPUs typically "
            "provides zero latency benefit.",
            True,
        ),
        "C": (
            "**Not correct.** Pruning and quantization are orthogonal techniques. "
            "A pruned model — whether structured or unstructured — can be quantized "
            "afterward. INT4 + structured pruning is a standard production combination "
            "precisely because each technique acts on a different aspect of the model "
            "(precision vs. architectural width).",
            False,
        ),
        "D": (
            # The sparsity range previously read "1030%": the dash between
            # the two bounds had been lost.
            "**Not correct.** The accuracy impact of unstructured pruning depends heavily "
            "on sparsity level and model type. At moderate sparsities (10–30%), unstructured "
            "pruning often hurts accuracy less than INT4 quantization. The problem is not "
            "accuracy — it is that you cannot exploit the sparsity for latency improvement "
            "without specialized sparse kernels. The hardware efficiency problem is "
            "independent of the accuracy impact.",
            False,
        ),
    }
    _text_r2, _correct_r2 = _REFL2[act2_reflection.value]
    mo.callout(mo.md(_text_r2), kind="success" if _correct_r2 else "warn")
    return
# ═════════════════════════════════════════════════════════════════════════════
# LEDGER SAVE + HUD FOOTER
# ═════════════════════════════════════════════════════════════════════════════
# ── CELL 25: LEDGER SAVE + HUD ────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(
    mo, ledger, COLORS,
    context_toggle,
    act1_prediction,
    act2_flagship, act2_midrange, act2_budget,
    LLAMA3_8B_FP32_GB,
):
    # Ledger save + HUD footer: record this lab's decisions through
    # `ledger.save(chapter=10, ...)` and render a one-line status summary.
    _ctx_hud = context_toggle.value
    _a1_pred_hud = act1_prediction.value or "unanswered"
    # Option "C" is the answer this cell scores as correct for Act I.
    _a1_correct_hud = _a1_pred_hud == "C"

    _SIZE_MAP_HUD = {
        "fp32": 32.0, "fp16": 16.0, "int8": 8.0, "int4": 4.0,
        "int4_prune10": 3.6, "int4_prune30": 2.8, "int4_prune50": 2.0,
        "int4_prune50_unstruct": 2.0, "distil_4b_int8": 4.1, "distil_1b_int8": 1.0,
    }
    _PARETO_HUD = {
        "fp32": True, "fp16": True, "int8": True, "int4": True,
        "int4_prune10": True, "int4_prune30": True, "int4_prune50": True,
        "int4_prune50_unstruct": False, "distil_4b_int8": False, "distil_1b_int8": False,
    }
    _BUDGETS_HUD = {"flagship": 8.0, "midrange": 4.0, "budget": 2.0}
    _SELECTED_HUD = {
        "flagship": act2_flagship.value,
        "midrange": act2_midrange.value,
        "budget": act2_budget.value,
    }

    # Size of each tier's selection; an unrecognised key counts as 0 GB.
    _sizes_hud = {
        _tier_hud: _SIZE_MAP_HUD.get(_key_hud, 0.0)
        for _tier_hud, _key_hud in _SELECTED_HUD.items()
    }
    _flagship_gb_hud = _sizes_hud["flagship"]
    _budget_gb_hud = _sizes_hud["budget"]
    _budget_limit_hud = _BUDGETS_HUD["budget"]

    # Limits come from the budget table instead of repeated literals, so the
    # OOM flag and the HUD text cannot drift from it.
    _constraint_hit_hud = any(
        _sizes_hud[_tier_hud] > _limit_hud
        for _tier_hud, _limit_hud in _BUDGETS_HUD.items()
    )
    _compression_method_hud = _SELECTED_HUD["flagship"]
    # Floor the divisor so an unknown (0 GB) selection cannot divide by zero.
    _compression_ratio_hud = LLAMA3_8B_FP32_GB / max(_flagship_gb_hud, 0.01)
    _pareto_optimal_hud = all(
        _PARETO_HUD.get(_key_hud, False) for _key_hud in _SELECTED_HUD.values()
    )

    ledger.save(
        chapter=10,
        design={
            "context": _ctx_hud,
            "compression_method": _compression_method_hud,
            "compression_ratio": round(_compression_ratio_hud, 2),
            "act1_prediction": _a1_pred_hud,
            "act1_correct": _a1_correct_hud,
            "act2_result": _budget_gb_hud,
            "act2_decision": (
                f"flagship={_SELECTED_HUD['flagship']};"
                f"mid={_SELECTED_HUD['midrange']};"
                f"budget={_SELECTED_HUD['budget']}"
            ),
            "constraint_hit": _constraint_hit_hud,
            "pareto_optimal": _pareto_optimal_hud,
        },
    )

    # ── HUD color coding ──────────────────────────────────────────────────────
    _green = "#4ade80"
    _red = "#f87171"
    _yellow = "#fbbf24"
    _muted = "#94a3b8"
    _a1_icon = _green if _a1_correct_hud else _yellow
    _oom_icon = _red if _constraint_hit_hud else _green
    _pf_icon = _green if _pareto_optimal_hud else _yellow

    mo.vstack([
        mo.md("---"),
        mo.Html(f"""
<div style="background:#0f172a; border-radius:12px; padding:16px 28px;
margin-top:24px; border:1px solid #1e293b;
font-family:'SF Mono', 'Fira Code', monospace; font-size:0.8rem;">
<div style="color:#475569; font-size:0.68rem; font-weight:700;
text-transform:uppercase; letter-spacing:0.12em; margin-bottom:12px;">
Design Ledger &mdash; Chapter 10 Saved
</div>
<div style="display:flex; gap:32px; flex-wrap:wrap; align-items:center;">
<div>
<span style="color:{_muted}; font-weight:600;">CONTEXT</span>
&nbsp;<span style="color:#e2e8f0;">{_ctx_hud.upper()}</span>
</div>
<div>
<span style="color:{_muted}; font-weight:600;">ACT I PREDICTION</span>
&nbsp;<span style="color:{_a1_icon};">
{_a1_pred_hud} &mdash; {'CORRECT' if _a1_correct_hud else 'INCORRECT'}
</span>
</div>
<div>
<span style="color:{_muted}; font-weight:600;">COMPRESSION</span>
&nbsp;<span style="color:#e2e8f0;">
{_compression_method_hud} ({_compression_ratio_hud:.1f}x)
</span>
</div>
<div>
<span style="color:{_muted}; font-weight:600;">OOM HIT</span>
&nbsp;<span style="color:{_oom_icon};">
{'YES' if _constraint_hit_hud else 'NO'}
</span>
</div>
<div>
<span style="color:{_muted}; font-weight:600;">PARETO-OPTIMAL</span>
&nbsp;<span style="color:{_pf_icon};">
{'YES' if _pareto_optimal_hud else 'NO'}
</span>
</div>
<div>
<span style="color:{_muted}; font-weight:600;">BUDGET TIER</span>
&nbsp;<span style="color:#e2e8f0;">
{_budget_gb_hud:.1f} GB / {_budget_limit_hud:.0f} GB limit
</span>
</div>
</div>
</div>
"""),
        mo.callout(
            mo.md(
                "**Lab 10 complete.** Your compression decisions are saved to the Design Ledger "
                "and will be referenced in Lab 11 (Hardware Acceleration — Roofline Model), "
                "where you will compute the arithmetic intensity of your compressed model and "
                "see where it falls relative to the memory bandwidth and compute ceilings."
            ),
            kind="success" if (not _constraint_hit_hud and _pareto_optimal_hud) else "info",
        ),
    ])
    return
# Script entry point: run the notebook app when this file is executed directly.
if __name__ == "__main__":
    app.run()