Mirror of https://github.com/harvard-edge/cs249r_book.git (synced 2026-04-30 09:38:38 -05:00)
Add all Vol1 (labs 01-16) and Vol2 (labs 01-17) interactive Marimo labs as the full first-pass implementation of the ML Systems curriculum labs.

Each lab follows the PROTOCOL 2-Act structure (35-40 min):
- Act I: Calibration with prediction lock → instruments → overlay
- Act II: Design challenge with failure states and reflection

Key pedagogical instruments introduced progressively:
- Vol1: D·A·M Triad, Iron Law, Memory Ledger, Roofline, Amdahl's Law, Little's Law, P99 Histogram, Compression Frontier, Chouldechova theorem
- Vol2: NVLink vs PCIe cliff, Bisection BW, Young-Daly T*, Parallelism Paradox, AllReduce ring vs tree, KV-cache model, Jevons Paradox, DP ε-δ tradeoff, SLO composition, Adversarial Pareto, two-volume synthesis capstone

All 35 staged files pass AST syntax verification (36/36 including lab_00).

Also includes:
- labs/LABS_SPEC.md: authoritative sub-agent brief for all lab conventions
- labs/core/style.py: expanded unified design system with semantic color tokens
1650 lines
74 KiB
Python
import marimo

__generated_with = "0.19.6"
app = marimo.App(width="full")

# ─────────────────────────────────────────────────────────────────────────────
# LAB 10: THE COMPRESSION FRONTIER
#
# Chapter: ML Optimizations — Model Compression (@sec-optimizations-model-compression)
# Core Invariant:
# Model compression is NOT free. Every technique moves on a Pareto frontier
# between model quality and resource savings. INT8 quantization gives ~4× memory
# reduction with <1% accuracy drop; INT4 gives ~8× with 2–5% drop; unstructured
# pruning is unpredictable on real hardware because dense kernels ignore zeros.
#
# 2 Contexts: Cloud (H100, 80 GB) vs Mobile (NPU, 8 GB)
#
# Act I — The Quantization Surprise (12–15 min)
# Stakeholder: Mobile App Team Lead
# Prediction: Is INT8 quantization "lossless"?
# Instrument: Quantization impact table — memory, accuracy, latency, energy
# across FP32 / FP16 / INT8 / INT4 / INT2 for selectable model.
# Reveal: Prediction-vs-reality overlay showing actual accuracy drop.
# Reflection: Why does INT8 preserve accuracy better than INT4?
#
# Act II — The Compression Trade-off Frontier (FIRST INTRODUCTION, 20–25 min)
# Stakeholder: Platform Engineering Lead
# Prediction: Best compression strategy for 3 mobile deployment tiers
# Instrument: Compression Trade-off Frontier — Pareto scatter (size vs quality)
# + tier budget dropdowns + metric cards.
# Failure state: OOM danger callout when selection exceeds tier memory budget.
# Reflection: Why is unstructured pruning hardware-inefficient?
#
# Design Ledger: chapter=10, context, compression_method, compression_ratio,
# act1_prediction, act1_correct, act2_result, act2_decision,
# constraint_hit, pareto_optimal
# ─────────────────────────────────────────────────────────────────────────────


# ── CELL 0: SETUP (hide_code=False — leave visible for instructor inspection) ─
|
||
@app.cell
|
||
def _():
|
||
import marimo as mo
|
||
import sys
|
||
import math
|
||
from pathlib import Path
|
||
import plotly.graph_objects as go
|
||
import numpy as np
|
||
from plotly.subplots import make_subplots
|
||
|
||
_root = Path(__file__).resolve().parents[2]
|
||
if str(_root) not in sys.path:
|
||
sys.path.insert(0, str(_root))
|
||
|
||
from labs.core.state import DesignLedger
|
||
from labs.core.style import COLORS, LAB_CSS, apply_plotly_theme
|
||
|
||
# ── Hardware constants (LABS_SPEC.md / NVIDIA and Apple specs) ────────────
|
||
H100_BW_GBS = 3350 # GB/s — H100 SXM5 HBM3e bandwidth
|
||
H100_TFLOPS_FP16 = 1979 # TFLOPS FP16 tensor core peak
|
||
H100_TFLOPS_INT8 = 3958 # TOPS INT8 tensor core (2× FP16)
|
||
H100_RAM_GB = 80 # GB HBM3e capacity
|
||
H100_TDP_W = 700 # Watts TDP
|
||
|
||
MOBILE_BW_GBS = 68 # GB/s — Apple A17-class SoC
|
||
MOBILE_TOPS_INT8 = 35 # TOPS INT8 NPU
|
||
MOBILE_RAM_GB = 8 # GB total unified memory
|
||
MOBILE_TDP_W = 5 # Watts sustained (thermal throttle ceiling)
|
||
|
||
# ── Bytes per value for each numeric format ────────────────────────────────
|
||
# Source: IEEE 754 / INT quantization: FP32=4B, FP16=2B, INT8=1B, INT4=0.5B, INT2=0.25B
|
||
DTYPE_BYTES = {
|
||
"fp32": 4.0,
|
||
"fp16": 2.0,
|
||
"int8": 1.0,
|
||
"int4": 0.5,
|
||
"int2": 0.25,
|
||
}
|
||
|
||
# ── ResNet-50 reference parameters ────────────────────────────────────────
|
||
# Source: @sec-optimizations-model-compression — canonical numbers
|
||
RESNET50_PARAMS_M = 25.6 # million parameters
|
||
RESNET50_FP32_MB = 98.0 # MB in FP32 (25.6M × 4B ≈ 102 MB, with overhead)
|
||
RESNET50_TOP1_ACC = 76.1 # % ImageNet top-1 (torchvision baseline)
|
||
|
||
# ── MobileNetV3-Large reference ───────────────────────────────────────────
|
||
MOBILENETV3_PARAMS_M = 5.4 # million parameters
|
||
MOBILENETV3_FP32_MB = 21.1 # MB in FP32
|
||
MOBILENETV3_TOP1_ACC = 75.8 # % ImageNet top-1
|
||
|
||
# ── ViT-Base/16 reference ─────────────────────────────────────────────────
|
||
VITBASE_PARAMS_M = 86.0 # million parameters
|
||
VITBASE_FP32_MB = 330.0 # MB in FP32
|
||
VITBASE_TOP1_ACC = 81.1 # % ImageNet top-1
|
||
|
||
# ── LLaMA-3 8B reference ──────────────────────────────────────────────────
|
||
LLAMA3_8B_PARAMS_B = 8.0 # billion parameters
|
||
LLAMA3_8B_FP32_GB = 32.0 # GB in FP32 (8B × 4B)
|
||
LLAMA3_8B_PPL = 6.14 # perplexity on WikiText-2 (FP32 baseline)
|
||
|
||
ledger = DesignLedger()
|
||
return (
|
||
mo, go, np, math, make_subplots,
|
||
ledger, COLORS, LAB_CSS, apply_plotly_theme,
|
||
H100_BW_GBS, H100_TFLOPS_FP16, H100_TFLOPS_INT8, H100_RAM_GB, H100_TDP_W,
|
||
MOBILE_BW_GBS, MOBILE_TOPS_INT8, MOBILE_RAM_GB, MOBILE_TDP_W,
|
||
DTYPE_BYTES,
|
||
RESNET50_PARAMS_M, RESNET50_FP32_MB, RESNET50_TOP1_ACC,
|
||
MOBILENETV3_PARAMS_M, MOBILENETV3_FP32_MB, MOBILENETV3_TOP1_ACC,
|
||
VITBASE_PARAMS_M, VITBASE_FP32_MB, VITBASE_TOP1_ACC,
|
||
LLAMA3_8B_PARAMS_B, LLAMA3_8B_FP32_GB, LLAMA3_8B_PPL,
|
||
)
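
# ── Illustrative sketch (not executed by the app): how the reference constants
# above map to compressed weight sizes and compression ratios. The numbers reuse
# values defined in this cell; the helper function itself is hypothetical.
#
#     def compressed_size_mb(params_m: float, bytes_per_value: float) -> float:
#         """Weight-only size estimate: params (millions) × bytes per value."""
#         return params_m * bytes_per_value  # 1e6 params × B/value / 1e6 B per MB
#
#     # ResNet-50 at INT8: 25.6 M params × 1 B   ≈ 25.6 MB → CR = 4 B / 1 B   = 4×
#     # LLaMA-3 8B at INT4: 8000 M params × 0.5 B ≈ 4 GB   → CR = 4 B / 0.5 B = 8×
#     for fmt, bpv in {"fp32": 4.0, "fp16": 2.0, "int8": 1.0, "int4": 0.5}.items():
#         print(fmt, compressed_size_mb(25.6, bpv), "MB, CR =", 4.0 / bpv)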
|
||
|
||
|
||
# ── CELL 1: HEADER ────────────────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo, LAB_CSS, COLORS):
|
||
_mobile_color = COLORS["Mobile"]
|
||
_cloud_color = COLORS["Cloud"]
|
||
mo.vstack([
|
||
LAB_CSS,
|
||
mo.Html(f"""
|
||
<div style="background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
|
||
padding: 36px 44px; border-radius: 16px; color: white;
|
||
box-shadow: 0 8px 32px rgba(0,0,0,0.3);">
|
||
<div style="font-size: 0.72rem; font-weight: 700; letter-spacing: 0.18em;
|
||
color: #475569; text-transform: uppercase; margin-bottom: 10px;">
|
||
Machine Learning Systems · Volume I · Lab 10
|
||
</div>
|
||
<h1 style="margin: 0 0 10px 0; font-size: 2.4rem; font-weight: 900;
|
||
color: #f8fafc; line-height: 1.1; letter-spacing: -0.02em;">
|
||
The Compression Frontier
|
||
</h1>
|
||
<p style="margin: 0 0 20px 0; font-size: 1.05rem; color: #94a3b8;
|
||
max-width: 700px; line-height: 1.65;">
|
||
Every compression technique trades model quality for resource savings.
|
||
INT8 quantization achieves 4× size reduction with under 1% accuracy
|
||
drop. INT4 reaches 8× but costs 2–5%. Unstructured pruning
|
||
often yields no speedup at all. The frontier is real, and you cannot
|
||
move along it for free.
|
||
</p>
|
||
<div style="display: flex; gap: 12px; flex-wrap: wrap;">
|
||
<span style="background: rgba(99,102,241,0.15); color: #a5b4fc;
|
||
padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
|
||
font-weight: 600; border: 1px solid rgba(99,102,241,0.25);">
|
||
Act I: The Quantization Surprise · 12–15 min
|
||
</span>
|
||
<span style="background: rgba(204,85,0,0.15); color: #fdba74;
|
||
padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
|
||
font-weight: 600; border: 1px solid rgba(204,85,0,0.25);">
|
||
Act II: The Compression Frontier · 20–25 min
|
||
</span>
|
||
<span style="background: rgba(16,185,129,0.15); color: #6ee7b7;
|
||
padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
|
||
font-weight: 600; border: 1px solid rgba(16,185,129,0.25);">
|
||
35–40 min total
|
||
</span>
|
||
</div>
|
||
<div style="display: flex; gap: 12px; flex-wrap: wrap; margin-top: 12px;">
|
||
<span class="badge badge-info">First use: Compression Trade-off Frontier</span>
|
||
<span class="badge badge-warn">Memory budget failure state active</span>
|
||
</div>
|
||
</div>
|
||
"""),
|
||
])
|
||
return
|
||
|
||
|
||
# ── CELL 2: RECOMMENDED READING ───────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.callout(mo.md("""
|
||
**Recommended Reading** — Complete the following before this lab:
|
||
|
||
- **@sec-optimizations-quantization** — Uniform quantization: the `Q(x) = round(x/scale + zero_point)` formula, scale calibration, PTQ vs QAT, hardware native support for INT8 vs INT4.
|
||
- **@sec-optimizations-pruning** — Unstructured vs structured pruning, the Lottery Ticket Hypothesis, why sparse weights do not automatically yield latency improvements on dense hardware.
|
||
- **@sec-optimizations-model-compression** — The accuracy-size Pareto frontier, compression ratio definition, why INT8 is the practical sweet spot for most deployments.
|
||
- **@sec-optimizations-knowledge-distillation** — Distillation as an alternative compression axis.
|
||
"""), kind="info")
|
||
return
|
||
|
||
|
||
# ── CELL 3: CONTEXT TOGGLE ────────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
context_toggle = mo.ui.radio(
|
||
options={
|
||
"Cloud (H100 — 80 GB HBM, 700 W)": "cloud",
|
||
"Mobile (NPU — 8 GB, 5 W sustained)": "mobile",
|
||
},
|
||
value="Cloud (H100 — 80 GB HBM, 700 W)",
|
||
label="Deployment context:",
|
||
inline=True,
|
||
)
|
||
mo.vstack([
|
||
mo.md("---"),
|
||
mo.md("**Select your deployment context.** Hardware constraints differ by more than 10× across these two environments."),
|
||
context_toggle,
|
||
])
|
||
return (context_toggle,)
|
||
|
||
|
||
# ── CELL 4: CONTEXT SPEC CARD ─────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo, context_toggle, COLORS):
|
||
_ctx = context_toggle.value
|
||
if _ctx == "cloud":
|
||
_accent = COLORS["Cloud"]
|
||
_bg = "#f0f4ff"
|
||
_border = "#c7d2fe"
|
||
_specs = [
|
||
("Device", "NVIDIA H100 SXM5"),
|
||
("HBM Capacity", "80 GB"),
|
||
("Memory Bandwidth", "3,350 GB/s"),
|
||
("FP16 Peak", "1,979 TFLOPS"),
|
||
("INT8 Peak", "3,958 TOPS (2x FP16)"),
|
||
("Power Budget", "700 W TDP"),
|
||
("INT8 native support", "Yes — Tensor Cores"),
|
||
("INT4 native support", "Yes — Tensor Cores"),
|
||
]
|
||
else:
|
||
_accent = COLORS["Mobile"]
|
||
_bg = "#fff7ed"
|
||
_border = "#fed7aa"
|
||
_specs = [
|
||
("Device", "Mobile NPU (Apple A17-class)"),
|
||
("RAM Capacity", "8 GB unified"),
|
||
("Memory Bandwidth", "68 GB/s"),
|
||
("INT8 Peak", "35 TOPS"),
|
||
("FP16 throughput", "~0.5x INT8 (software emulation path)"),
|
||
("Power Budget", "5 W sustained"),
|
||
("INT8 native support", "Yes — Neural Engine"),
|
||
("INT4 native support", "Partial — model-dependent"),
|
||
]
|
||
|
||
_rows = "".join(
|
||
f'<div style="display:flex; justify-content:space-between; padding:5px 0; '
|
||
f'border-bottom:1px solid {_border}; font-size:0.85rem;">'
|
||
f'<span style="color:#475569; font-weight:600;">{k}</span>'
|
||
f'<span style="font-family:monospace; color:{_accent}; font-weight:700;">{v}</span>'
|
||
f'</div>'
|
||
for k, v in _specs
|
||
)
|
||
|
||
mo.Html(f"""
|
||
<div style="background:{_bg}; border:1px solid {_border}; border-left:4px solid {_accent};
|
||
border-radius:8px; padding:16px 20px; margin: 8px 0;">
|
||
<div style="font-size:0.72rem; font-weight:700; color:{_accent}; text-transform:uppercase;
|
||
letter-spacing:0.1em; margin-bottom:10px;">
|
||
Active Context — Hardware Constraints
|
||
</div>
|
||
{_rows}
|
||
</div>
|
||
""")
|
||
return
|
||
|
||
|
||
# ═════════════════════════════════════════════════════════════════════════════
|
||
# ACT I — THE QUANTIZATION SURPRISE
|
||
# ═════════════════════════════════════════════════════════════════════════════
|
||
|
||
# ── CELL 5: ACT I SCENARIO ────────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo, COLORS):
|
||
_color = COLORS["Mobile"]
|
||
mo.vstack([
|
||
mo.md("---"),
|
||
mo.Html(f"""
|
||
<div style="background: linear-gradient(90deg, #0f172a, #1e293b);
|
||
padding: 10px 20px; border-radius: 8px; margin: 8px 0;">
|
||
<span style="font-size:0.72rem; font-weight:700; color:#6366f1;
|
||
text-transform:uppercase; letter-spacing:0.15em;">
|
||
Act I · Calibration · 12–15 min
|
||
</span>
|
||
<span style="font-size:1.2rem; font-weight:800; color:#f8fafc; margin-left:16px;">
|
||
The Quantization Surprise
|
||
</span>
|
||
</div>
|
||
"""),
|
||
mo.Html(f"""
|
||
<div style="border-left:4px solid {_color}; background:#fff7ed;
|
||
border-radius:0 10px 10px 0; padding:16px 22px; margin:12px 0;">
|
||
<div style="font-size:0.72rem; font-weight:700; color:{_color};
|
||
text-transform:uppercase; letter-spacing:0.1em; margin-bottom:6px;">
|
||
Incoming Message · Mobile App Team Lead
|
||
</div>
|
||
<div style="font-style:italic; font-size:1.0rem; color:#1e293b; line-height:1.65;">
|
||
"We have a ResNet-50 running at 98 MB in FP32. Our App Store limit
|
||
is 25 MB for the on-device model. A colleague told me INT8 quantization
|
||
is mathematically lossless — it just changes the number format, so
|
||
accuracy is preserved. Is that true? Can we ship INT8 with zero quality
|
||
regression?"
|
||
</div>
|
||
</div>
|
||
"""),
|
||
mo.md("""
|
||
The team lead has heard that quantization is "lossless." Before you run the
|
||
instruments, commit to a prediction. The chapter established
|
||
(@sec-optimizations-quantization) that uniform quantization introduces a
|
||
rounding error bounded by half the step size. The question is how large that
|
||
error is in practice on a real model.
|
||
"""),
|
||
])
|
||
return
|
||
|
||
|
||
# ── CELL 6: ACT I PREDICTION LOCK ─────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
act1_prediction = mo.ui.radio(
|
||
options={
|
||
"A) Yes — INT8 is mathematically equivalent to FP32 for inference": "A",
|
||
"B) Under 0.1% accuracy drop — essentially lossless for practical purposes": "B",
|
||
"C) 0.5 to 2% accuracy drop — practically acceptable but not zero": "C",
|
||
"D) 5 to 10% accuracy drop — unacceptable for production use": "D",
|
||
},
|
||
label=(
|
||
"Applying INT8 post-training quantization (PTQ) to ResNet-50 (FP32 baseline: "
|
||
"76.1% ImageNet top-1). What accuracy change do you expect?"
|
||
),
|
||
)
|
||
mo.vstack([
|
||
mo.Html("""
|
||
<div style="background: #1e293b; border-radius: 12px; padding: 20px;
|
||
border-left: 4px solid #6366f1; margin: 8px 0;">
|
||
<div style="font-size: 0.72rem; font-weight: 700; color: #a5b4fc;
|
||
text-transform: uppercase; letter-spacing: 0.1em; margin-bottom: 10px;">
|
||
Prediction Lock — Act I
|
||
</div>
|
||
<div style="color: #e2e8f0; font-size: 0.88rem; margin-bottom: 12px;">
|
||
Commit before touching any controls. Your prediction will be
|
||
compared to the actual result at the end of this act.
|
||
</div>
|
||
</div>
|
||
"""),
|
||
act1_prediction,
|
||
])
|
||
return (act1_prediction,)
|
||
|
||
|
||
# ── CELL 7: ACT I GATE ────────────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo, act1_prediction):
|
||
mo.stop(
|
||
act1_prediction.value is None,
|
||
mo.callout(
|
||
mo.md("Select your prediction above to unlock the quantization instrument."),
|
||
kind="warn",
|
||
),
|
||
)
|
||
return
|
||
|
||
|
||
# ── CELL 8: ACT I CONTROLS ────────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
act1_model = mo.ui.dropdown(
|
||
options={
|
||
"ResNet-50 (25.6 M params, 98 MB FP32)": "resnet50",
|
||
"MobileNetV3-Large (5.4 M params, 21 MB FP32)": "mobilenetv3",
|
||
"ViT-Base/16 (86 M params, 330 MB FP32)": "vitbase",
|
||
},
|
||
value="ResNet-50 (25.6 M params, 98 MB FP32)",
|
||
label="Model architecture",
|
||
)
|
||
act1_quant_scheme = mo.ui.dropdown(
|
||
options={
|
||
"PTQ — Post-Training Quantization (no retraining)": "ptq",
|
||
"QAT — Quantization-Aware Training (fine-tuned)": "qat",
|
||
},
|
||
value="PTQ — Post-Training Quantization (no retraining)",
|
||
label="Quantization scheme",
|
||
)
|
||
act1_calib_size = mo.ui.slider(
|
||
start=128, stop=4096, value=512, step=128,
|
||
label="Calibration dataset size (PTQ only)",
|
||
)
|
||
mo.vstack([
|
||
mo.md("### Quantization Impact Table — Controls"),
|
||
mo.hstack([act1_model, act1_quant_scheme], justify="start", gap="2rem"),
|
||
mo.hstack([act1_calib_size], justify="start"),
|
||
mo.callout(mo.md(
|
||
"**PTQ vs QAT:** Post-training quantization requires only a calibration "
|
||
"dataset (no gradient computation). QAT fine-tunes with simulated quantization "
|
||
"and recovers 0.2–0.5% additional accuracy, but requires GPU training time."
|
||
), kind="info"),
|
||
])
|
||
return (act1_model, act1_quant_scheme, act1_calib_size)
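
# ── Illustrative sketch (not executed by the app): what the calibration dataset
# buys in PTQ. A plain min/max range observer is shown; production frameworks use
# histogram or percentile observers, and this helper is hypothetical.
#
#     import numpy as np
#
#     def int8_scale_zero_point(calib_activations: np.ndarray):
#         """Asymmetric per-tensor INT8 range estimate from calibration samples."""
#         lo, hi = float(calib_activations.min()), float(calib_activations.max())
#         scale = (hi - lo) / 255.0          # 2^8 - 1 quantization steps
#         zero_point = round(-lo / scale)    # integer level that represents 0.0
#         return scale, zero_point
#
#     # More calibration samples give a better estimate of the true activation
#     # range, which is why the slider's effect saturates once the range stabilizes.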
|
||
|
||
|
||
# ── CELL 9: ACT I PHYSICS ENGINE + QUANTIZATION TABLE ─────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(
|
||
mo,
|
||
act1_model, act1_quant_scheme, act1_calib_size,
|
||
context_toggle,
|
||
RESNET50_PARAMS_M, RESNET50_FP32_MB, RESNET50_TOP1_ACC,
|
||
MOBILENETV3_PARAMS_M, MOBILENETV3_FP32_MB, MOBILENETV3_TOP1_ACC,
|
||
VITBASE_PARAMS_M, VITBASE_FP32_MB, VITBASE_TOP1_ACC,
|
||
H100_TDP_W, MOBILE_TDP_W,
|
||
DTYPE_BYTES,
|
||
):
|
||
# ── Model lookup ──────────────────────────────────────────────────────────
|
||
_model_key = act1_model.value
|
||
_scheme = act1_quant_scheme.value
|
||
_calib = act1_calib_size.value
|
||
_ctx = context_toggle.value
|
||
|
||
_MODEL_SPECS = {
|
||
"resnet50": {"params_m": RESNET50_PARAMS_M, "fp32_mb": RESNET50_FP32_MB, "base_acc": RESNET50_TOP1_ACC},
|
||
"mobilenetv3": {"params_m": MOBILENETV3_PARAMS_M, "fp32_mb": MOBILENETV3_FP32_MB, "base_acc": MOBILENETV3_TOP1_ACC},
|
||
"vitbase": {"params_m": VITBASE_PARAMS_M, "fp32_mb": VITBASE_FP32_MB, "base_acc": VITBASE_TOP1_ACC},
|
||
}
|
||
_spec = _MODEL_SPECS[_model_key]
|
||
_base_acc = _spec["base_acc"]
|
||
_fp32_mb = _spec["fp32_mb"]
|
||
|
||
# ── Calibration quality multiplier ────────────────────────────────────────
|
||
# Source: @sec-optimizations-quantization — larger calibration sets reduce
|
||
# the range estimation error. Effect saturates beyond ~1024 samples.
|
||
# Small calib (<256): adds ~0.2% extra accuracy penalty; large (>2048): minimal effect.
|
||
_calib_penalty = 0.0
|
||
if _scheme == "ptq":
|
||
if _calib < 256:
|
||
_calib_penalty = 0.25
|
||
elif _calib < 512:
|
||
_calib_penalty = 0.10
|
||
else:
|
||
_calib_penalty = 0.0
|
||
|
||
# ── QAT recovery bonus ────────────────────────────────────────────────────
|
||
# Source: @sec-optimizations-quantization — QAT trains with simulated quantization
|
||
# noise, recovering 0.2–0.5% accuracy compared to PTQ at the same bit-width.
|
||
_qat_recovery = 0.35 if _scheme == "qat" else 0.0
|
||
|
||
# ── Accuracy drop model ───────────────────────────────────────────────────
|
||
# Source: @sec-optimizations-model-compression empirical figures:
|
||
# FP32 -> FP16: <0.05% (rounding only, 8-bit exponent preserved)
|
||
# FP16 -> INT8: 0.3–0.7% PTQ (linear range mapping loses outlier precision)
|
||
# INT8 -> INT4: 1.5–3.5% PTQ (4x more quantization bins lost)
|
||
# INT4 -> INT2: 4–8% PTQ (severe representational collapse)
|
||
# ViT is more sensitive to quantization than ResNets due to attention softmax.
|
||
_SENSITIVITY = {
|
||
"resnet50": 1.0,
|
||
"mobilenetv3": 0.85,
|
||
"vitbase": 1.35,
|
||
}
|
||
_s = _SENSITIVITY[_model_key]
|
||
|
||
_ACC_DROP_PTQ = {
|
||
"fp32": 0.00,
|
||
"fp16": 0.04 * _s,
|
||
"int8": 0.50 * _s + _calib_penalty,
|
||
"int4": 2.60 * _s + _calib_penalty * 1.5,
|
||
"int2": 6.80 * _s + _calib_penalty * 2.0,
|
||
}
|
||
|
||
_ACC_DROP = {
|
||
k: max(0.0, v - _qat_recovery * (v / (_ACC_DROP_PTQ["int4"] + 0.001)))
|
||
for k, v in _ACC_DROP_PTQ.items()
|
||
}
|
||
|
||
# ── Memory size by format ──────────────────────────────────────────────────
|
||
# Source: @sec-optimizations-quantization
|
||
# compression ratio = FP32_bytes / target_bytes
|
||
_MEM_MB = {
|
||
fmt: _fp32_mb * (bpv / 4.0)
|
||
for fmt, bpv in DTYPE_BYTES.items()
|
||
}
|
||
|
||
# ── Latency model ──────────────────────────────────────────────────────────
|
||
# Source: @sec-optimizations-model-compression — memory-bandwidth bound inference.
|
||
# Latency ≈ model_size_bytes / memory_bandwidth × 1000 (ms)
|
||
# Cloud: H100 3350 GB/s, natively supports INT8/INT4 at 2× throughput.
|
||
# Mobile: NPU 68 GB/s, FP16 uses software fallback path (~2× slower than INT8).
|
||
_CLOUD_LATENCY_FACTOR = {
|
||
"fp32": 1.0,
|
||
"fp16": 0.50,
|
||
"int8": 0.25,
|
||
"int4": 0.15,
|
||
"int2": 0.12,
|
||
}
|
||
_MOBILE_LATENCY_FACTOR = {
|
||
"fp32": 4.0,
|
||
"fp16": 1.80,
|
||
"int8": 1.00,
|
||
"int4": 0.65,
|
||
"int2": 0.90,
|
||
}
|
||
|
||
# Absolute latency calibration (ms, batch=1)
|
||
_CLOUD_FP32_BASE_MS = 1.2 if _model_key == "resnet50" else (0.3 if _model_key == "mobilenetv3" else 4.8)
|
||
_MOBILE_INT8_BASE_MS = 4.5 if _model_key == "resnet50" else (1.1 if _model_key == "mobilenetv3" else 18.0)
|
||
|
||
if _ctx == "cloud":
|
||
_LAT_BASE = _CLOUD_FP32_BASE_MS
|
||
_LAT_FACTOR = _CLOUD_LATENCY_FACTOR
|
||
else:
|
||
_LAT_BASE = _MOBILE_INT8_BASE_MS / _MOBILE_LATENCY_FACTOR["int8"]
|
||
_LAT_FACTOR = _MOBILE_LATENCY_FACTOR
|
||
|
||
_POWER_W = H100_TDP_W if _ctx == "cloud" else MOBILE_TDP_W
|
||
|
||
# ── Build per-format rows ─────────────────────────────────────────────────
|
||
_formats = ["fp32", "fp16", "int8", "int4", "int2"]
|
||
_fmt_labels = {"fp32": "FP32", "fp16": "FP16", "int8": "INT8", "int4": "INT4", "int2": "INT2"}
|
||
|
||
_rows = []
|
||
for _fmt in _formats:
|
||
_acc_val = _base_acc - _ACC_DROP[_fmt]
|
||
_drop_val = _ACC_DROP[_fmt]
|
||
_mem_val = _MEM_MB[_fmt]
|
||
_lat_val = _LAT_BASE * _LAT_FACTOR[_fmt]
|
||
_energy_mj = _lat_val * _POWER_W  # latency (ms) × power (W) = energy in mJ
|
||
_cr = _fp32_mb / _mem_val
|
||
|
||
if _drop_val < 0.15:
|
||
_acc_color = "#008F45"
|
||
elif _drop_val < 1.0:
|
||
_acc_color = "#CC5500"
|
||
else:
|
||
_acc_color = "#CB202D"
|
||
|
||
_rows.append({
|
||
"fmt": _fmt,
|
||
"label": _fmt_labels[_fmt],
|
||
"accuracy": _acc_val,
|
||
"drop": _drop_val,
|
||
"acc_color": _acc_color,
|
||
"mem_mb": _mem_val,
|
||
"cr": _cr,
|
||
"lat_ms": _lat_val,
|
||
"energy_mj": _energy_mj,
|
||
})
|
||
|
||
# ── HTML table ────────────────────────────────────────────────────────────
|
||
_HEADER_STYLE = (
|
||
"background:#1e293b; color:#94a3b8; font-size:0.72rem; font-weight:700; "
|
||
"text-transform:uppercase; letter-spacing:0.08em; padding:8px 12px; "
|
||
"text-align:right; white-space:nowrap;"
|
||
)
|
||
_CELL_STYLE = "padding:8px 12px; text-align:right; font-size:0.88rem; font-family:monospace;"
|
||
|
||
_table_rows_html = ""
|
||
for _r in _rows:
|
||
_is_int8 = _r["fmt"] == "int8"
|
||
_bg_row = "background:#f0fdf4;" if _is_int8 else ""
|
||
_drop_color = "#CB202D" if _r["drop"] > 1.5 else ("#CC5500" if _r["drop"] > 0.3 else "#008F45")
|
||
_table_rows_html += (
|
||
f'<tr style="{_bg_row}border-bottom:1px solid #e2e8f0;">'
|
||
f'<td style="padding:8px 12px; font-weight:800; font-size:0.88rem; color:#0f172a;">'
|
||
f'{_r["label"]}</td>'
|
||
f'<td style="{_CELL_STYLE} color:{_r["acc_color"]}; font-weight:700;">'
|
||
f'{_r["accuracy"]:.2f}%</td>'
|
||
f'<td style="{_CELL_STYLE} color:{_drop_color}; font-weight:700;">'
|
||
f'-{_r["drop"]:.2f}%</td>'
|
||
f'<td style="{_CELL_STYLE} color:#006395; font-weight:700;">'
|
||
f'{_r["mem_mb"]:.1f} MB</td>'
|
||
f'<td style="{_CELL_STYLE} color:#475569;">'
|
||
f'{_r["cr"]:.1f}x</td>'
|
||
f'<td style="{_CELL_STYLE} color:#475569;">'
|
||
f'{_r["lat_ms"]:.2f} ms</td>'
|
||
f'<td style="{_CELL_STYLE} color:#475569;">'
|
||
f'{_r["energy_mj"]:.3f} mJ</td>'
|
||
f'</tr>'
|
||
)
|
||
|
||
_ctx_label = "Cloud (H100)" if _ctx == "cloud" else "Mobile (NPU)"
|
||
_scheme_label = "PTQ" if _scheme == "ptq" else "QAT"
|
||
_calib_note = f" · Calib: {_calib} samples" if _scheme == "ptq" else ""
|
||
|
||
mo.Html(f"""
|
||
<div style="margin: 16px 0;">
|
||
<div style="font-size:0.72rem; font-weight:700; color:#475569;
|
||
text-transform:uppercase; letter-spacing:0.1em; margin-bottom:8px;">
|
||
Quantization Impact Table — {_model_key.upper()}
|
||
· {_scheme_label} · {_ctx_label}{_calib_note}
|
||
</div>
|
||
<div style="overflow-x:auto; border-radius:12px; border:1px solid #e2e8f0;
|
||
box-shadow:0 2px 8px rgba(0,0,0,0.04);">
|
||
<table style="width:100%; border-collapse:collapse; min-width:600px;">
|
||
<thead>
|
||
<tr>
|
||
<th style="{_HEADER_STYLE} text-align:left;">Format</th>
|
||
<th style="{_HEADER_STYLE}">Accuracy (Top-1)</th>
|
||
<th style="{_HEADER_STYLE}">Accuracy Drop</th>
|
||
<th style="{_HEADER_STYLE}">Model Size</th>
|
||
<th style="{_HEADER_STYLE}">Compression</th>
|
||
<th style="{_HEADER_STYLE}">Inference Latency</th>
|
||
<th style="{_HEADER_STYLE}">Energy / Inference</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
{_table_rows_html}
|
||
</tbody>
|
||
</table>
|
||
</div>
|
||
<div style="margin-top:8px; font-size:0.78rem; color:#94a3b8; line-height:1.5;">
|
||
INT8 highlighted — practical sweet spot: 4x compression,
|
||
under 1% accuracy penalty on PTQ with adequate calibration.
|
||
</div>
|
||
</div>
|
||
""")
|
||
return
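
# ── Illustrative sketch (not executed by the app): the first-order latency model
# the table above assumes, i.e. weight traffic divided by memory bandwidth. The
# numbers reuse constants from CELL 0; the helper itself is hypothetical.
#
#     def bw_bound_latency_ms(model_size_mb: float, bandwidth_gbs: float) -> float:
#         """Lower-bound latency if inference is memory-bandwidth bound (batch=1)."""
#         return model_size_mb / 1000.0 / bandwidth_gbs * 1000.0  # MB→GB, s→ms
#
#     # ResNet-50 INT8 (24.5 MB) on the mobile NPU (68 GB/s) gives ≈ 0.36 ms of
#     # pure weight traffic; the ~4.5 ms figure used in the table also covers
#     # activation traffic and kernel overheads, so treat this as a floor.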
|
||
|
||
|
||
# ── CELL 10: ACT I ACCURACY/SIZE CHART ────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(
|
||
mo, go, make_subplots, COLORS,
|
||
act1_model, act1_quant_scheme, act1_calib_size,
|
||
RESNET50_FP32_MB, RESNET50_TOP1_ACC,
|
||
MOBILENETV3_FP32_MB, MOBILENETV3_TOP1_ACC,
|
||
VITBASE_FP32_MB, VITBASE_TOP1_ACC,
|
||
DTYPE_BYTES,
|
||
):
|
||
# ── Replicate accuracy-drop model (same physics as cell 9) ───────────────
|
||
_model_key = act1_model.value
|
||
_scheme = act1_quant_scheme.value
|
||
_calib = act1_calib_size.value
|
||
|
||
_MODEL_LOOKUP = {
|
||
"resnet50": {"fp32_mb": RESNET50_FP32_MB, "base_acc": RESNET50_TOP1_ACC},
|
||
"mobilenetv3": {"fp32_mb": MOBILENETV3_FP32_MB, "base_acc": MOBILENETV3_TOP1_ACC},
|
||
"vitbase": {"fp32_mb": VITBASE_FP32_MB, "base_acc": VITBASE_TOP1_ACC},
|
||
}
|
||
_spec2 = _MODEL_LOOKUP[_model_key]
|
||
_base_acc = _spec2["base_acc"]
|
||
_fp32_mb = _spec2["fp32_mb"]
|
||
|
||
_calib_penalty = 0.25 if (_scheme == "ptq" and _calib < 256) else (0.10 if (_scheme == "ptq" and _calib < 512) else 0.0)
|
||
_qat_recovery = 0.35 if _scheme == "qat" else 0.0
|
||
_SENS2 = {"resnet50": 1.0, "mobilenetv3": 0.85, "vitbase": 1.35}
|
||
_s2 = _SENS2[_model_key]
|
||
|
||
_ACC_DROP2_PTQ = {
|
||
"fp32": 0.00,
|
||
"fp16": 0.04 * _s2,
|
||
"int8": 0.50 * _s2 + _calib_penalty,
|
||
"int4": 2.60 * _s2 + _calib_penalty * 1.5,
|
||
"int2": 6.80 * _s2 + _calib_penalty * 2.0,
|
||
}
|
||
_ACC_DROP2 = {
|
||
k: max(0.0, v - _qat_recovery * (v / (_ACC_DROP2_PTQ["int4"] + 0.001)))
|
||
for k, v in _ACC_DROP2_PTQ.items()
|
||
}
|
||
|
||
_formats = ["fp32", "fp16", "int8", "int4", "int2"]
|
||
_fmt_labels = ["FP32", "FP16", "INT8", "INT4", "INT2"]
|
||
_acc_vals = [_base_acc - _ACC_DROP2[f] for f in _formats]
|
||
_mem_mb = [_fp32_mb * (DTYPE_BYTES[f] / 4.0) for f in _formats]
|
||
_bar_colors = [
|
||
COLORS["BlueLine"] if _ACC_DROP2[f] < 0.15
|
||
else COLORS["OrangeLine"] if _ACC_DROP2[f] < 1.0
|
||
else COLORS["RedLine"]
|
||
for f in _formats
|
||
]
|
||
|
||
_fig_a1 = make_subplots(
|
||
rows=1, cols=2,
|
||
subplot_titles=("Accuracy by Format (Top-1 %)", "Model Size by Format (MB)"),
|
||
horizontal_spacing=0.12,
|
||
)
|
||
|
||
_fig_a1.add_trace(
|
||
go.Bar(
|
||
name="Accuracy", x=_fmt_labels, y=_acc_vals,
|
||
marker_color=_bar_colors,
|
||
text=[f"{v:.2f}%" for v in _acc_vals],
|
||
textposition="outside",
|
||
textfont=dict(size=11, family="SF Mono, monospace"),
|
||
),
|
||
row=1, col=1,
|
||
)
|
||
_fig_a1.add_hline(
|
||
y=_base_acc, row=1, col=1,
|
||
line_color=COLORS["GreenLine"], line_dash="dash", line_width=1.5,
|
||
annotation_text="FP32 baseline",
|
||
annotation_font_color=COLORS["GreenLine"],
|
||
annotation_position="right",
|
||
)
|
||
_fig_a1.add_trace(
|
||
go.Bar(
|
||
name="Size (MB)", x=_fmt_labels, y=_mem_mb,
|
||
marker_color=[COLORS["BlueLine"]] * len(_formats),
|
||
text=[f"{v:.1f} MB" for v in _mem_mb],
|
||
textposition="outside",
|
||
textfont=dict(size=11, family="SF Mono, monospace"),
|
||
showlegend=False,
|
||
),
|
||
row=1, col=2,
|
||
)
|
||
_fig_a1.add_hline(
|
||
y=25, row=1, col=2,
|
||
line_color=COLORS["OrangeLine"], line_dash="dot", line_width=2,
|
||
annotation_text="25 MB App Store target",
|
||
annotation_font_color=COLORS["OrangeLine"],
|
||
annotation_position="right",
|
||
)
|
||
_fig_a1.update_layout(
|
||
height=380, showlegend=False,
|
||
plot_bgcolor="white", paper_bgcolor="white",
|
||
font_family="Inter, sans-serif",
|
||
margin=dict(l=40, r=140, t=50, b=40),
|
||
)
|
||
_fig_a1.update_yaxes(gridcolor="#f1f5f9", row=1, col=1)
|
||
_fig_a1.update_yaxes(gridcolor="#f1f5f9", row=1, col=2)
|
||
_fig_a1.update_xaxes(linecolor=COLORS["Border"])
|
||
|
||
mo.vstack([
|
||
mo.md("### Accuracy and Size Trade-off by Format"),
|
||
mo.plotly(_fig_a1),
|
||
])
|
||
return
|
||
|
||
|
||
# ── CELL 11: ACT I MATHPEEK ───────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.accordion({
|
||
"The governing equation — Uniform Quantization": mo.md("""
|
||
**Uniform Quantization Formula** (@sec-optimizations-quantization):
|
||
|
||
```
|
||
Q(x) = round(x / scale + zero_point)
|
||
```
|
||
|
||
**Scale calculation (symmetric, per-tensor):**
|
||
|
||
```
|
||
scale = max(|x|) / (2^(bits-1) - 1)
|
||
```
|
||
|
||
For INT8 (bits=8): `scale = max(|x|) / 127`
|
||
|
||
**Quantization error bound:**
|
||
|
||
```
|
||
|epsilon_Q| <= scale / 2 = max(|x|) / (2 x (2^(bits-1) - 1))
|
||
```
|
||
|
||
**Why INT8 outperforms INT4:**
|
||
|
||
- INT8: 256 discrete levels — max relative error = 0.39% of range
|
||
- INT4: 16 discrete levels — max relative error = 6.25% of range
|
||
- Ratio: 16x more quantization error at INT4 vs INT8
|
||
|
||
**Compression ratio:**
|
||
|
||
```
|
||
CR = FP32_size / target_size = 4 bytes / target_bytes_per_value
|
||
```
|
||
|
||
INT8: CR = 4/1 = **4x** | INT4: CR = 4/0.5 = **8x** | INT2: CR = 4/0.25 = **16x**
|
||
"""),
|
||
})
|
||
return
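
# ── Illustrative sketch (not executed by the app): the uniform quantization
# round trip from the MathPeek above, comparing INT8 vs INT4 error on random
# weights. NumPy only; the function and variable names are hypothetical.
#
#     import numpy as np
#
#     def quantize_dequantize(x: np.ndarray, bits: int) -> np.ndarray:
#         """Symmetric per-tensor uniform quantization: Q(x) = round(x / scale)."""
#         qmax = 2 ** (bits - 1) - 1                 # 127 for INT8, 7 for INT4
#         scale = np.max(np.abs(x)) / qmax
#         q = np.clip(np.round(x / scale), -qmax, qmax)
#         return q * scale                           # dequantize back to float
#
#     rng = np.random.default_rng(0)
#     w = rng.normal(0.0, 0.05, size=100_000)        # toy weight tensor
#     for bits in (8, 4):
#         err = np.abs(w - quantize_dequantize(w, bits)).max()
#         print(f"INT{bits}: max abs error ≈ {err:.5f}")  # INT4 error ≈ 18× INT8's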
|
||
|
||
|
||
# ── CELL 12: ACT I PREDICTION-VS-REALITY REVEAL ───────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(
|
||
mo, act1_prediction,
|
||
act1_model, act1_quant_scheme, act1_calib_size,
|
||
RESNET50_TOP1_ACC, MOBILENETV3_TOP1_ACC, VITBASE_TOP1_ACC,
|
||
):
|
||
_model_key = act1_model.value
|
||
_scheme = act1_quant_scheme.value
|
||
_calib = act1_calib_size.value
|
||
|
||
_ACC_LOOKUP = {
|
||
"resnet50": RESNET50_TOP1_ACC,
|
||
"mobilenetv3": MOBILENETV3_TOP1_ACC,
|
||
"vitbase": VITBASE_TOP1_ACC,
|
||
}
|
||
_base_acc = _ACC_LOOKUP[_model_key]
|
||
_calib_penalty = 0.25 if (_scheme == "ptq" and _calib < 256) else (0.10 if (_scheme == "ptq" and _calib < 512) else 0.0)
|
||
_qat_recovery = 0.35 if _scheme == "qat" else 0.0
|
||
_SENS3 = {"resnet50": 1.0, "mobilenetv3": 0.85, "vitbase": 1.35}
|
||
_s3 = _SENS3[_model_key]
|
||
|
||
_int8_drop = max(0.0, 0.50 * _s3 + _calib_penalty - _qat_recovery * 0.35)
|
||
|
||
_pred_val = act1_prediction.value
|
||
_PRED_BANDS = {
|
||
"A": (0.0, 0.0),
|
||
"B": (0.0, 0.1),
|
||
"C": (0.5, 2.0),
|
||
"D": (5.0, 10.0),
|
||
}
|
||
_lo, _hi = _PRED_BANDS[_pred_val]
|
||
_correct = _lo <= _int8_drop <= _hi
|
||
|
||
_FEEDBACK = {
|
||
"A": (
|
||
f"**Not quite.** INT8 is not mathematically equivalent to FP32. "
|
||
f"Quantization maps each floating-point weight to one of 256 discrete integer levels "
|
||
f"using `Q(x) = round(x/scale + zero_point)`. Every rounding is a real error. "
|
||
f"For {_model_key.upper()}, INT8 PTQ costs **{_int8_drop:.2f}% accuracy** — "
|
||
f"small, but nonzero and measurable."
|
||
),
|
||
"B": (
|
||
f"**Close, but the data disagrees.** INT8 is not lossless — it introduces "
|
||
f"rounding error bounded by `scale/2`. For ResNet-50, the INT8 drop is "
|
||
f"**{_int8_drop:.2f}%**, which is above the 0.1% threshold. "
|
||
f"QAT can bring it close to 0.1%, but standard PTQ will not."
|
||
),
|
||
"C": (
|
||
f"**Correct.** INT8 PTQ introduces a measurable but practically acceptable "
|
||
f"accuracy penalty. For {_model_key.upper()}, the actual drop is "
|
||
f"**{_int8_drop:.2f}%**. This falls squarely in the 0.5–2% range for PTQ with "
|
||
f"adequate calibration. The team lead was wrong that INT8 is lossless, "
|
||
f"but right that it is usable in production."
|
||
),
|
||
"D": (
|
||
f"**Not quite.** A 5–10% drop would make INT8 unusable, but that level of "
|
||
f"degradation is characteristic of INT2 or very aggressive INT4, not INT8. "
|
||
f"For {_model_key.upper()}, INT8 PTQ costs only **{_int8_drop:.2f}%** accuracy — "
|
||
f"enough to notice in A/B testing, but not enough to block deployment."
|
||
),
|
||
}
|
||
|
||
mo.callout(
|
||
mo.md(_FEEDBACK[_pred_val]),
|
||
kind="success" if _correct else "warn",
|
||
)
|
||
return
|
||
|
||
|
||
# ── CELL 13: ACT I REFLECTION ─────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
act1_reflection = mo.ui.radio(
|
||
options={
|
||
"A) INT8 has more representable values — less rounding error in the linear mapping": "A",
|
||
"B) INT8 is only applied to weights, not activations, so errors cancel": "B",
|
||
"C) INT4 always uses non-uniform quantization which amplifies error": "C",
|
||
"D) INT8 activations are always exactly representable in hardware": "D",
|
||
},
|
||
label="Reflection: Why does INT8 preserve accuracy better than INT4 in uniform quantization?",
|
||
)
|
||
mo.vstack([
|
||
mo.md("---"),
|
||
mo.md("### Reflection — Act I"),
|
||
act1_reflection,
|
||
])
|
||
return (act1_reflection,)
|
||
|
||
|
||
# ── CELL 14: ACT I REFLECTION FEEDBACK ───────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo, act1_reflection):
|
||
mo.stop(
|
||
act1_reflection.value is None,
|
||
mo.callout(mo.md("Select an answer to see the explanation."), kind="warn"),
|
||
)
|
||
|
||
_REFL1 = {
|
||
"A": (
|
||
"**Correct.** INT8 has 256 representable levels; INT4 has only 16. "
|
||
"The quantization scale factor `scale = max(|x|) / (2^(bits-1) - 1)` "
|
||
"is 16x larger for INT4 than INT8 when the value range is the same. "
|
||
"Each rounding error is up to 16x larger, and those errors accumulate "
|
||
"through layers. The accuracy gap between INT8 and INT4 is fundamentally "
|
||
"an information-capacity gap.",
|
||
True,
|
||
),
|
||
"B": (
|
||
"**Not correct.** Modern quantization (PTQ and QAT) applies to both "
|
||
"weights and activations. Quantizing only weights would reduce memory "
|
||
"footprint but leave inference arithmetic in FP32, missing the full "
|
||
"latency benefit. The accuracy penalty comes from both domains.",
|
||
False,
|
||
),
|
||
"C": (
|
||
"**Not correct.** Standard INT4 quantization uses uniform mapping, just "
|
||
"like INT8. Non-uniform quantization (e.g., NF4 used in QLoRA) actually "
|
||
"improves accuracy by placing more bins near zero where values cluster. "
|
||
"The INT4 accuracy penalty is a direct consequence of having 16 levels "
|
||
"vs 256 — not a property of uniform vs non-uniform mapping.",
|
||
False,
|
||
),
|
||
"D": (
|
||
"**Not correct.** INT8 activations are not exactly representable — "
|
||
"they are the output of the same rounding that weights undergo. "
|
||
"An activation of 0.732 mapped to scale=0.006 becomes round(0.732/0.006) = 122, "
|
||
"which dequantizes to 0.732 plus-or-minus 0.003. The error exists; it is "
|
||
"bounded by `scale/2`.",
|
||
False,
|
||
),
|
||
}
|
||
|
||
_text, _correct = _REFL1[act1_reflection.value]
|
||
mo.callout(mo.md(_text), kind="success" if _correct else "warn")
|
||
return
|
||
|
||
|
||
# ═════════════════════════════════════════════════════════════════════════════
|
||
# ACT II — THE COMPRESSION TRADE-OFF FRONTIER
|
||
# ═════════════════════════════════════════════════════════════════════════════
|
||
|
||
# ── CELL 15: ACT II SCENARIO ──────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo, COLORS):
|
||
_color = COLORS["Cloud"]
|
||
mo.vstack([
|
||
mo.md("---"),
|
||
mo.Html(f"""
|
||
<div style="background: linear-gradient(90deg, #0f172a, #1e293b);
|
||
padding: 10px 20px; border-radius: 8px; margin: 8px 0;">
|
||
<span style="font-size:0.72rem; font-weight:700; color:{_color};
|
||
text-transform:uppercase; letter-spacing:0.15em;">
|
||
Act II · Design Challenge · 20–25 min
|
||
</span>
|
||
<span style="font-size:1.2rem; font-weight:800; color:#f8fafc; margin-left:16px;">
|
||
The Compression Trade-off Frontier
|
||
</span>
|
||
</div>
|
||
"""),
|
||
mo.Html(f"""
|
||
<div style="border-left:4px solid {_color}; background:#f0f4ff;
|
||
border-radius:0 10px 10px 0; padding:16px 22px; margin:12px 0;">
|
||
<div style="font-size:0.72rem; font-weight:700; color:{_color};
|
||
text-transform:uppercase; letter-spacing:0.1em; margin-bottom:6px;">
|
||
Incoming Message · Platform Engineering Lead
|
||
</div>
|
||
<div style="font-style:italic; font-size:1.0rem; color:#1e293b; line-height:1.65;">
|
||
"We are deploying LLaMA-3 8B as an on-device model across three mobile
|
||
tiers in our user base: Flagship (8 GB RAM), Mid-range (4 GB), and
|
||
Budget (2 GB). I need a different compression strategy for each tier.
|
||
FP32 is 32 GB — none of them can fit that. Design the compression stack
|
||
that keeps each tier as close to FP32 quality as possible within its
|
||
memory budget."
|
||
</div>
|
||
</div>
|
||
"""),
|
||
mo.Html("""
|
||
<div style="display:grid; grid-template-columns:repeat(3, 1fr); gap:14px; margin:16px 0;">
|
||
<div style="background:#f0fdf4; border:1px solid #bbf7d0; border-top:4px solid #008F45;
|
||
border-radius:8px; padding:14px;">
|
||
<div style="font-weight:800; color:#14532d; font-size:0.9rem; margin-bottom:4px;">
|
||
Flagship Tier
|
||
</div>
|
||
<div style="font-family:monospace; font-size:1.4rem; font-weight:900; color:#008F45;">
|
||
8 GB
|
||
</div>
|
||
<div style="font-size:0.8rem; color:#166534; margin-top:4px;">
|
||
Available RAM budget
|
||
</div>
|
||
</div>
|
||
<div style="background:#fff7ed; border:1px solid #fed7aa; border-top:4px solid #CC5500;
|
||
border-radius:8px; padding:14px;">
|
||
<div style="font-weight:800; color:#9a3412; font-size:0.9rem; margin-bottom:4px;">
|
||
Mid-range Tier
|
||
</div>
|
||
<div style="font-family:monospace; font-size:1.4rem; font-weight:900; color:#CC5500;">
|
||
4 GB
|
||
</div>
|
||
<div style="font-size:0.8rem; color:#7c2d12; margin-top:4px;">
|
||
Available RAM budget
|
||
</div>
|
||
</div>
|
||
<div style="background:#fef2f2; border:1px solid #fecaca; border-top:4px solid #CB202D;
|
||
border-radius:8px; padding:14px;">
|
||
<div style="font-weight:800; color:#991b1b; font-size:0.9rem; margin-bottom:4px;">
|
||
Budget Tier
|
||
</div>
|
||
<div style="font-family:monospace; font-size:1.4rem; font-weight:900; color:#CB202D;">
|
||
2 GB
|
||
</div>
|
||
<div style="font-size:0.8rem; color:#7f1d1d; margin-top:4px;">
|
||
Available RAM budget
|
||
</div>
|
||
</div>
|
||
</div>
|
||
"""),
|
||
mo.callout(mo.md("""
|
||
**First introduction: Compression Trade-off Frontier** — This instrument
|
||
(@sec-optimizations-model-compression) plots every compression configuration
|
||
as a point in (model size, quality) space, then highlights the Pareto frontier:
|
||
the set of configurations where you cannot improve quality without increasing size,
|
||
or reduce size without hurting quality. Your goal is to select the Pareto-optimal
|
||
configuration for each deployment tier.
|
||
"""), kind="info"),
|
||
])
|
||
return
|
||
|
||
|
||
# ── CELL 16: ACT II PREDICTION LOCK ───────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
act2_prediction = mo.ui.radio(
|
||
options={
|
||
"A) Use INT8 for all three tiers — INT8 is the universal safe choice": "A",
|
||
"B) Flagship: INT8, Mid-range: INT4, Budget: INT4 + 50% structured pruning": "B",
|
||
"C) 50% unstructured pruning for all tiers — pruning is always better than quantization": "C",
|
||
"D) Distill a separate small model for budget tier — quantization never works below 4 bits": "D",
|
||
},
|
||
label=(
|
||
"LLaMA-3 8B, FP32 = 32 GB. Design the compression strategy for Flagship (8 GB), "
|
||
"Mid-range (4 GB), and Budget (2 GB) tiers that maximizes quality within each budget."
|
||
),
|
||
)
|
||
mo.vstack([
|
||
mo.Html("""
|
||
<div style="background: #1e293b; border-radius: 12px; padding: 20px;
|
||
border-left: 4px solid #6366f1; margin: 8px 0;">
|
||
<div style="font-size: 0.72rem; font-weight: 700; color: #a5b4fc;
|
||
text-transform: uppercase; letter-spacing: 0.1em; margin-bottom: 10px;">
|
||
Prediction Lock — Act II
|
||
</div>
|
||
<div style="color: #e2e8f0; font-size: 0.88rem; margin-bottom: 12px;">
|
||
Commit your strategy prediction before exploring the Frontier.
|
||
</div>
|
||
</div>
|
||
"""),
|
||
act2_prediction,
|
||
])
|
||
return (act2_prediction,)
|
||
|
||
|
||
# ── CELL 17: ACT II GATE ──────────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo, act2_prediction):
|
||
mo.stop(
|
||
act2_prediction.value is None,
|
||
mo.callout(
|
||
mo.md("Select your strategy prediction above to unlock the Compression Frontier."),
|
||
kind="warn",
|
||
),
|
||
)
|
||
return
|
||
|
||
|
||
# ── CELL 18: ACT II CONTROLS ──────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
_COMPRESS_OPTIONS = {
|
||
"FP32 (32 GB — 1x compression)": "fp32",
|
||
"FP16 (16 GB — 2x compression)": "fp16",
|
||
"INT8 (8 GB — 4x compression)": "int8",
|
||
"INT4 (4 GB — 8x compression)": "int4",
|
||
"INT4 + 10% structured pruning (~3.6 GB)": "int4_prune10",
|
||
"INT4 + 30% structured pruning (~2.8 GB)": "int4_prune30",
|
||
"INT4 + 50% structured pruning (~2.0 GB)": "int4_prune50",
|
||
"INT4 + 50% unstructured pruning (~2.0 GB)": "int4_prune50_unstruct",
|
||
"Distilled 4B model, INT8 (~4 GB)": "distil_4b_int8",
|
||
"Distilled 1B model, INT8 (~1 GB)": "distil_1b_int8",
|
||
}
|
||
|
||
act2_flagship = mo.ui.dropdown(
|
||
options=_COMPRESS_OPTIONS,
|
||
value="INT8 (8 GB — 4x compression)",
|
||
label="Flagship tier (8 GB budget)",
|
||
)
|
||
act2_midrange = mo.ui.dropdown(
|
||
options=_COMPRESS_OPTIONS,
|
||
value="INT4 (4 GB — 8x compression)",
|
||
label="Mid-range tier (4 GB budget)",
|
||
)
|
||
act2_budget = mo.ui.dropdown(
|
||
options=_COMPRESS_OPTIONS,
|
||
value="INT4 + 50% structured pruning (~2.0 GB)",
|
||
label="Budget tier (2 GB budget)",
|
||
)
|
||
|
||
mo.vstack([
|
||
mo.md("### Assign a Compression Strategy to Each Deployment Tier"),
|
||
mo.md("""
|
||
Select a compression configuration for each tier. The Compression Trade-off
|
||
Frontier below will update to show where each choice sits relative to the
|
||
Pareto-optimal boundary.
|
||
"""),
|
||
mo.hstack([act2_flagship, act2_midrange, act2_budget], justify="start", gap="1.5rem"),
|
||
])
|
||
return (act2_flagship, act2_midrange, act2_budget)
|
||
|
||
|
||
# ── CELL 19: COMPRESSION FRONTIER PLOT + TIER METRICS ─────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(
|
||
mo, go, COLORS,
|
||
act2_flagship, act2_midrange, act2_budget,
|
||
LLAMA3_8B_PPL,
|
||
):
|
||
# ── LLaMA-3 8B compression data ───────────────────────────────────────────
|
||
# Source: @sec-optimizations-model-compression — empirical compression curves.
|
||
# Perplexity (lower = better): FP32 baseline = 6.14 (WikiText-2).
|
||
# Calibrated to published llama.cpp / GGUF / bitsandbytes benchmarks.
|
||
#
|
||
# Tuple structure: (label, method_key, size_gb, perplexity, pareto_flag)
|
||
_CONFIGS = [
|
||
("FP32 (baseline)", "fp32", 32.0, 6.14, True ),
|
||
("FP16", "fp16", 16.0, 6.16, True ),
|
||
("INT8", "int8", 8.0, 6.21, True ),
|
||
("INT4", "int4", 4.0, 6.47, True ),
|
||
("INT4 + 10% struct. pruning", "int4_prune10", 3.6, 6.63, True ),
|
||
("INT4 + 30% struct. pruning", "int4_prune30", 2.8, 7.12, True ),
|
||
("INT4 + 50% struct. pruning", "int4_prune50", 2.0, 8.05, True ),
|
||
("INT4 + 50% unstruct. pruning", "int4_prune50_unstruct", 2.0, 7.85, False),
|
||
("Distilled 4B, INT8", "distil_4b_int8", 4.1, 7.30, False),
|
||
("Distilled 1B, INT8", "distil_1b_int8", 1.0, 9.80, False),
|
||
]
|
||
|
||
_CONFIG_MAP = {c[1]: c for c in _CONFIGS}
|
||
|
||
_TIER_BUDGETS = {"flagship": 8.0, "midrange": 4.0, "budget": 2.0}
|
||
_TIER_KEYS = {
|
||
"flagship": act2_flagship.value,
|
||
"midrange": act2_midrange.value,
|
||
"budget": act2_budget.value,
|
||
}
|
||
_TIER_COLORS = {
|
||
"flagship": COLORS["GreenLine"],
|
||
"midrange": COLORS["OrangeLine"],
|
||
"budget": COLORS["RedLine"],
|
||
}
|
||
_TIER_LABELS = {"flagship": "Flagship", "midrange": "Mid-range", "budget": "Budget"}
|
||
|
||
_bg_x, _bg_y, _bg_text = [], [], []
|
||
_pf_x, _pf_y, _pf_text = [], [], []
|
||
|
||
for _label_c, _key_c, _sz_c, _ppl_c, _on_pareto_c in _CONFIGS:
|
||
if _on_pareto_c:
|
||
_pf_x.append(_sz_c)
|
||
_pf_y.append(_ppl_c)
|
||
_pf_text.append(_label_c)
|
||
else:
|
||
_bg_x.append(_sz_c)
|
||
_bg_y.append(_ppl_c)
|
||
_bg_text.append(_label_c)
|
||
|
||
_pf_sorted = sorted(zip(_pf_x, _pf_y, _pf_text))
|
||
_pf_x_s = [p[0] for p in _pf_sorted]
|
||
_pf_y_s = [p[1] for p in _pf_sorted]
|
||
|
||
_fig2 = go.Figure()
|
||
|
||
_fig2.add_trace(go.Scatter(
|
||
x=_bg_x, y=_bg_y,
|
||
mode="markers",
|
||
name="Dominated (off-frontier)",
|
||
marker=dict(color="#94a3b8", size=10, symbol="circle-open", line=dict(width=2)),
|
||
text=_bg_text,
|
||
hovertemplate="<b>%{text}</b><br>Size: %{x:.1f} GB<br>Perplexity: %{y:.2f}<extra></extra>",
|
||
))
|
||
|
||
_fig2.add_trace(go.Scatter(
|
||
x=_pf_x_s, y=_pf_y_s,
|
||
mode="lines",
|
||
name="Pareto frontier",
|
||
line=dict(color=COLORS["BlueLine"], width=2, dash="dot"),
|
||
showlegend=True,
|
||
hoverinfo="skip",
|
||
))
|
||
|
||
_fig2.add_trace(go.Scatter(
|
||
x=_pf_x, y=_pf_y,
|
||
mode="markers",
|
||
name="Pareto-optimal",
|
||
marker=dict(color=COLORS["BlueLine"], size=11, symbol="circle",
|
||
line=dict(color="white", width=2)),
|
||
text=_pf_text,
|
||
hovertemplate="<b>%{text}</b><br>Size: %{x:.1f} GB<br>Perplexity: %{y:.2f}<extra></extra>",
|
||
))
|
||
|
||
for _tier_n, _budget_gb in _TIER_BUDGETS.items():
|
||
_fig2.add_vline(
|
||
x=_budget_gb,
|
||
line_color=_TIER_COLORS[_tier_n],
|
||
line_width=1.5,
|
||
line_dash="dash",
|
||
annotation_text=f"{_TIER_LABELS[_tier_n]} ({_budget_gb:.0f} GB)",
|
||
annotation_font_color=_TIER_COLORS[_tier_n],
|
||
annotation_position="top",
|
||
)
|
||
|
||
for _tier_n2, _sel_key in _TIER_KEYS.items():
|
||
if _sel_key in _CONFIG_MAP:
|
||
_c2 = _CONFIG_MAP[_sel_key]
|
||
_fig2.add_trace(go.Scatter(
|
||
x=[_c2[2]], y=[_c2[3]],
|
||
mode="markers+text",
|
||
name=f"{_TIER_LABELS[_tier_n2]} selection",
|
||
marker=dict(color=_TIER_COLORS[_tier_n2], size=18,
|
||
symbol="star", line=dict(color="white", width=2)),
|
||
text=[_TIER_LABELS[_tier_n2]],
|
||
textposition="top center",
|
||
textfont=dict(size=11, color=_TIER_COLORS[_tier_n2]),
|
||
hovertemplate=(
|
||
f"<b>{_TIER_LABELS[_tier_n2]}: {_c2[0]}</b>"
|
||
f"<br>Size: {_c2[2]:.1f} GB<br>Perplexity: {_c2[3]:.2f}"
|
||
f"<extra></extra>"
|
||
),
|
||
))
|
||
|
||
_fig2.update_layout(
|
||
xaxis=dict(
|
||
title="Model Size (GB)",
|
||
type="log",
|
||
tickvals=[1, 2, 4, 8, 16, 32],
|
||
ticktext=["1 GB", "2 GB", "4 GB", "8 GB", "16 GB", "32 GB"],
|
||
gridcolor="#f1f5f9", linecolor=COLORS["Border"],
|
||
range=[-0.05, 1.55],
|
||
),
|
||
yaxis=dict(
|
||
title="Perplexity on WikiText-2 (lower = better)",
|
||
gridcolor="#f1f5f9", linecolor=COLORS["Border"],
|
||
range=[5.8, 10.5],
|
||
),
|
||
height=500,
|
||
plot_bgcolor="white",
|
||
paper_bgcolor="white",
|
||
font_family="Inter, sans-serif",
|
||
font_color=COLORS["Text"],
|
||
margin=dict(l=60, r=40, t=30, b=60),
|
||
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0),
|
||
)
|
||
|
||
# ── Per-tier metric cards ──────────────────────────────────────────────────
|
||
_tier_cards_html = ""
|
||
for _tier_n3, _sel_key3 in _TIER_KEYS.items():
|
||
_budget3 = _TIER_BUDGETS[_tier_n3]
|
||
_accent3 = _TIER_COLORS[_tier_n3]
|
||
if _tier_n3 == "flagship":
|
||
_bg3, _bd3 = "#f0fdf4", "#bbf7d0"
|
||
elif _tier_n3 == "midrange":
|
||
_bg3, _bd3 = "#fff7ed", "#fed7aa"
|
||
else:
|
||
_bg3, _bd3 = "#fef2f2", "#fecaca"
|
||
|
||
if _sel_key3 in _CONFIG_MAP:
|
||
_c3 = _CONFIG_MAP[_sel_key3]
|
||
_sz3 = _c3[2]
|
||
_ppl3 = _c3[3]
|
||
_pareto3 = _c3[4]
|
||
_fits3 = _sz3 <= _budget3
|
||
_ppl_delta3 = _ppl3 - LLAMA3_8B_PPL
|
||
_ppl_col3 = "#008F45" if _ppl_delta3 < 0.3 else ("#CC5500" if _ppl_delta3 < 1.5 else "#CB202D")
|
||
_fit_str3 = f"{_sz3:.1f} GB (fits)" if _fits3 else f"{_sz3:.1f} GB — EXCEEDS {_budget3:.0f} GB"
|
||
_fit_col3 = "#008F45" if _fits3 else "#CB202D"
|
||
_pb_badge = (
|
||
'<span style="background:#f0fdf4; border:1px solid #bbf7d0; padding:1px 7px; '
|
||
'border-radius:4px; font-weight:700; color:#008F45; font-size:0.72rem;">Pareto-optimal</span>'
|
||
if _pareto3 else
|
||
'<span style="background:#fef2f2; border:1px solid #fecaca; padding:1px 7px; '
|
||
'border-radius:4px; font-weight:700; color:#CB202D; font-size:0.72rem;">Off-frontier</span>'
|
||
)
|
||
else:
|
||
_sz3, _ppl3, _ppl_delta3 = 0.0, 0.0, 0.0
|
||
_fit_str3, _fit_col3, _ppl_col3, _pb_badge = "Unknown", "#94a3b8", "#94a3b8", ""
|
||
|
||
_tier_cards_html += (
|
||
f'<div style="background:{_bg3}; border:1px solid {_bd3}; border-top:4px solid {_accent3};'
|
||
f'border-radius:8px; padding:14px 16px; flex:1; min-width:180px;">'
|
||
f'<div style="font-weight:800; color:{_accent3}; font-size:0.85rem; margin-bottom:8px;">'
|
||
f'{_TIER_LABELS[_tier_n3]} ({_budget3:.0f} GB budget)</div>'
|
||
f'<div style="font-size:0.82rem; line-height:1.9;">'
|
||
f'<div><span style="color:#475569; font-weight:600;">Strategy:</span> '
|
||
f'<span style="font-family:monospace; color:#0f172a;">{_sel_key3}</span></div>'
|
||
f'<div><span style="color:#475569; font-weight:600;">Size:</span> '
|
||
f'<span style="font-family:monospace; color:{_fit_col3}; font-weight:700;">{_fit_str3}</span></div>'
|
||
f'<div><span style="color:#475569; font-weight:600;">Perplexity:</span> '
|
||
f'<span style="font-family:monospace; color:{_ppl_col3}; font-weight:700;">'
|
||
f'{_ppl3:.2f} (+{_ppl_delta3:.2f} vs FP32)</span></div>'
|
||
f'<div style="margin-top:4px;">{_pb_badge}</div>'
|
||
f'</div></div>'
|
||
)
|
||
|
||
mo.vstack([
|
||
mo.md("### Compression Trade-off Frontier — LLaMA-3 8B"),
|
||
mo.md("""
|
||
Each point is a compression configuration. **Blue dots** lie on the Pareto
|
||
frontier — where you cannot improve quality without increasing size.
|
||
**Star markers** show your selections. Dashed vertical lines mark each
|
||
tier's memory budget.
|
||
"""),
|
||
mo.plotly(_fig2),
|
||
mo.md("#### Per-Tier Metric Summary"),
|
||
mo.Html(f'<div style="display:flex; gap:14px; flex-wrap:wrap; margin:12px 0;">{_tier_cards_html}</div>'),
|
||
])
|
||
return
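
# ── Illustrative sketch (not executed by the app): a generic dominance check for
# building a (size, perplexity) Pareto frontier. The helper is hypothetical and
# does not reproduce every hand-set pareto_flag above: the unstructured-pruning
# point is marked off-frontier because its nominal size ignores that dense kernels
# gain nothing from unstructured zeros, and the distilled 1B point is a different
# model rather than a compressed LLaMA-3 8B.
#
#     def pareto_optimal(points):
#         """points: list of (size_gb, perplexity); lower is better on both axes."""
#         flags = []
#         for i, (s_i, p_i) in enumerate(points):
#             dominated = any(
#                 s_j <= s_i and p_j <= p_i and (s_j < s_i or p_j < p_i)
#                 for j, (s_j, p_j) in enumerate(points) if j != i
#             )
#             flags.append(not dominated)
#         return flags
#
#     # e.g. "Distilled 4B, INT8" (4.1 GB, 7.30) is dominated by "INT4" (4.0 GB, 6.47),
#     # matching its False flag in _CONFIGS.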
|
||
|
||
|
||
# ── CELL 20: FAILURE STATE (OOM DETECTION) ────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo, act2_flagship, act2_midrange, act2_budget):
|
||
_SIZE_MAP_OOM = {
|
||
"fp32": 32.0, "fp16": 16.0, "int8": 8.0, "int4": 4.0,
|
||
"int4_prune10": 3.6, "int4_prune30": 2.8, "int4_prune50": 2.0,
|
||
"int4_prune50_unstruct": 2.0, "distil_4b_int8": 4.1, "distil_1b_int8": 1.0,
|
||
}
|
||
_BUDGETS_OOM = {"flagship": 8.0, "midrange": 4.0, "budget": 2.0}
|
||
_TIERS_OOM = {
|
||
"flagship": act2_flagship.value,
|
||
"midrange": act2_midrange.value,
|
||
"budget": act2_budget.value,
|
||
}
|
||
_TIER_LABELS_OOM = {"flagship": "Flagship", "midrange": "Mid-range", "budget": "Budget"}
|
||
|
||
_violations = []
|
||
for _tier_oom, _key_oom in _TIERS_OOM.items():
|
||
_sz_oom = _SIZE_MAP_OOM.get(_key_oom, 0.0)
|
||
if _sz_oom > _BUDGETS_OOM[_tier_oom]:
|
||
_violations.append((_tier_oom, _sz_oom, _BUDGETS_OOM[_tier_oom]))
|
||
|
||
_widgets_oom = []
|
||
for _tier_v, _req_v, _avail_v in _violations:
|
||
_widgets_oom.append(
|
||
mo.callout(
|
||
mo.md(
|
||
f"**OOM — Infeasible for {_TIER_LABELS_OOM[_tier_v]} tier.** "
|
||
f"Required: **{_req_v:.1f} GB** | Available: **{_avail_v:.0f} GB** | "
|
||
f"Overflow: **{_req_v - _avail_v:.1f} GB over budget.** "
|
||
f"Select a more aggressive compression scheme for this tier."
|
||
),
|
||
kind="danger",
|
||
)
|
||
)
|
||
|
||
if _widgets_oom:
|
||
mo.vstack(_widgets_oom)
|
||
else:
|
||
mo.callout(
|
||
mo.md(
|
||
"**All tiers within budget.** "
|
||
"Every selected configuration fits within its deployment tier memory limit."
|
||
),
|
||
kind="success",
|
||
)
|
||
return
|
||
|
||
|
||
# ── CELL 21: ACT II PREDICTION FEEDBACK ───────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo, act2_prediction):
|
||
_pred2 = act2_prediction.value
|
||
_FEEDBACK_A2 = {
|
||
"A": (
|
||
"**Not quite.** INT8 gives an 8 GB footprint for LLaMA-3 8B — exactly the "
|
||
"Flagship budget. But the Mid-range tier has only 4 GB and the Budget tier "
|
||
"only 2 GB. A single INT8 strategy violates both smaller tiers. You need a "
|
||
"tiered approach that applies progressively stronger compression as the budget "
|
||
"shrinks.",
|
||
False,
|
||
),
|
||
"B": (
|
||
"**Correct.** This is the Pareto-optimal tiered allocation. "
|
||
"Flagship at INT8 uses the full 8 GB budget with minimal accuracy loss. "
|
||
"Mid-range at INT4 hits exactly the 4 GB constraint. "
|
||
"Budget at INT4 + 50% structured pruning reaches the 2 GB ceiling while "
|
||
"remaining on the Pareto frontier — structured pruning removes entire "
|
||
"attention heads and MLP blocks, so the compressed model still runs "
|
||
"efficiently on dense hardware.",
|
||
True,
|
||
),
|
||
"C": (
|
||
"**Not correct.** Unstructured pruning sets individual weights to zero but "
|
||
"leaves the tensor dimensions unchanged. Dense matrix kernels on NPUs execute "
|
||
"the same number of MAC operations regardless of how many are zero — the "
|
||
"hardware does not skip zeros. The result is a compressed file but not a "
|
||
"faster computation. Structured pruning removes entire rows/columns, which "
|
||
"genuinely reduces the arithmetic and fits the model into a smaller budget.",
|
||
False,
|
||
),
|
||
"D": (
|
||
"**Not correct.** Knowledge distillation is a valid axis, but it is not required "
|
||
"here. INT4 + structured pruning reaches 2 GB for LLaMA-3 8B while remaining on "
|
||
"the Pareto frontier. Distilling a separate 1B model produces a fundamentally "
|
||
"different model with different capabilities — appropriate only when quality "
|
||
"degradation from pruning is unacceptable.",
|
||
False,
|
||
),
|
||
}
|
||
|
||
_text2, _correct2 = _FEEDBACK_A2[_pred2]
|
||
mo.callout(mo.md(_text2), kind="success" if _correct2 else "warn")
|
||
return
|
||
|
||
|
||
# ── CELL 22: ACT II MATHPEEK ──────────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
|
||
def _(mo):
|
||
mo.accordion({
|
||
"The governing equations — Compression Ratio and Pareto Frontier": mo.md("""
|
||
**Compression ratio** (@sec-optimizations-model-compression):
|
||
|
||
```
|
||
CR = Size_FP32 / Size_compressed
|
||
= (N_params x 4 bytes) / (N_params x bytes_per_value x (1 - sparsity))
|
||
```
|
||
|
||
For INT8 + 0% pruning: CR = 4 / 1.0 = **4x**
|
||
|
||
For INT4 + 50% structured pruning: CR = 4 / (0.5 x 0.5) = **16x**
|
||
|
||
**Quantization error bound (uniform per-tensor):**
|
||
|
||
```
|
||
|epsilon_Q| <= max(|x|) / (2 x (2^bits - 1))
|
||
```
|
||
|
||
INT8: max_err = range / 510 — approximately 0.2% of range
|
||
|
||
INT4: max_err = range / 30 — approximately 3.3% of range

**Structured vs unstructured pruning:**

```
Structured:   removes complete rows/columns -> dense submatrix -> hardware efficient
Unstructured: zeros individual elements     -> sparse matrix   -> dense kernel unchanged
```

Structured pruning at sparsity `s` removes exactly that fraction of the MACs:

```
MACs_pruned = MACs_dense x (1 - sparsity)
```

Unstructured pruning at sparsity `s` reduces latency ONLY when specialized
sparse kernels are available (e.g., NVIDIA A100 2:4 sparsity). Without hardware
support, latency is **unchanged** even at 90% sparsity.
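
A toy sketch of the kernel's perspective (illustrative; the `sparse_kernel` flag stands in for
hardware support such as A100 2:4 sparsity):

```python
def effective_macs(macs_dense: int, sparsity: float,
                   structured: bool, sparse_kernel: bool = False) -> int:
    # Best case: work shrinks with sparsity only if the kernel can actually skip zeros.
    if structured or sparse_kernel:
        return int(macs_dense * (1.0 - sparsity))
    return macs_dense  # a dense kernel still multiplies every zero

effective_macs(1_000_000, 0.5, structured=True)   # 500_000
effective_macs(1_000_000, 0.9, structured=False)  # 1_000_000 (no speedup)
```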

**Empirical accuracy-size tradeoff law:**

```
delta_perplexity ≈ alpha x log2(CR)
```

where alpha ≈ 0.12 for LLaMA-class models at moderate compression ratios.
This log relationship explains why compression becomes increasingly costly
as you push toward extreme ratios (INT2, very high sparsity).
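
Plugging a few ratios into this relationship (illustrative, using the alpha ≈ 0.12 fit quoted above):

```python
import math

alpha = 0.12
for cr in (4, 8, 16, 32):
    print(f"CR {cr:>2}x -> delta_perplexity ~ {alpha * math.log2(cr):.2f}")
# CR  4x -> 0.24, CR  8x -> 0.36, CR 16x -> 0.48, CR 32x -> 0.60
```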
"""),
    })
    return


# ── CELL 23: ACT II REFLECTION ────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    act2_reflection = mo.ui.radio(
        options={
            "A) Pruning removes too many weights, making the model too small to function": "A",
            "B) Sparse operations require special hardware support — dense kernels execute zero weights unchanged": "B",
            "C) Pruned models cannot subsequently be quantized": "C",
            "D) Unstructured pruning always hurts accuracy more than quantization": "D",
        },
        label="Reflection: Why is unstructured pruning often hardware-inefficient in practice?",
    )
    mo.vstack([
        mo.md("---"),
        mo.md("### Reflection — Act II"),
        act2_reflection,
    ])
    return (act2_reflection,)


# ── CELL 24: ACT II REFLECTION FEEDBACK ──────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act2_reflection):
    mo.stop(
        act2_reflection.value is None,
        mo.callout(mo.md("Select an answer to see the explanation."), kind="warn"),
    )

    _REFL2 = {
        "A": (
            "**Not correct.** Pruning does not make a model too small to function — "
            "it creates sparse weights within the original tensor dimensions. The model "
            "architecture is unchanged; only individual weight values are forced to zero. "
            "The hardware-efficiency problem is not about model capacity; it is about "
            "whether the underlying arithmetic can exploit those zeros.",
            False,
        ),
        "B": (
            "**Correct.** Dense matrix multiply kernels (GEMM) on GPUs and NPUs are "
            "designed for dense inputs. A kernel computing `C = A x B` iterates over "
            "every element of A — including the zeros introduced by unstructured pruning. "
            "The operation count is identical to the unpruned case. Memory bandwidth "
            "savings require the weights to be stored sparsely (e.g., CSR format), but "
            "even that requires a sparse GEMM kernel. NVIDIA A100 supports 2:4 structured "
            "sparsity natively; arbitrary unstructured sparsity on mobile NPUs typically "
            "provides zero latency benefit.",
            True,
        ),
        "C": (
            "**Not correct.** Pruning and quantization are orthogonal techniques. "
            "A pruned model — whether structured or unstructured — can be quantized "
            "afterward. INT4 + structured pruning is a standard production combination "
            "precisely because each technique acts on a different aspect of the model "
            "(precision vs. architectural width).",
            False,
        ),
        "D": (
            "**Not correct.** The accuracy impact of unstructured pruning depends heavily "
            "on sparsity level and model type. At moderate sparsities (10–30%), unstructured "
            "pruning often hurts accuracy less than INT4 quantization. The problem is not "
            "accuracy — it is that you cannot exploit the sparsity for latency improvement "
            "without specialized sparse kernels. The hardware efficiency problem is "
            "independent of the accuracy impact.",
            False,
        ),
    }

    _text_r2, _correct_r2 = _REFL2[act2_reflection.value]
    mo.callout(mo.md(_text_r2), kind="success" if _correct_r2 else "warn")
    return


# ═════════════════════════════════════════════════════════════════════════════
# LEDGER SAVE + HUD FOOTER
# ═════════════════════════════════════════════════════════════════════════════

# ── CELL 25: LEDGER SAVE + HUD ────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(
    mo, ledger, COLORS,
    context_toggle,
    act1_prediction,
    act2_flagship, act2_midrange, act2_budget,
    LLAMA3_8B_FP32_GB,
):
    _ctx_hud = context_toggle.value

    _a1_pred_hud = act1_prediction.value or "unanswered"
    _a1_correct_hud = _a1_pred_hud == "C"
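
    # Model footprint in GB for LLaMA-3 8B under each Act II compression option.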
    _SIZE_MAP_HUD = {
        "fp32": 32.0, "fp16": 16.0, "int8": 8.0, "int4": 4.0,
        "int4_prune10": 3.6, "int4_prune30": 2.8, "int4_prune50": 2.0,
        "int4_prune50_unstruct": 2.0, "distil_4b_int8": 4.1, "distil_1b_int8": 1.0,
    }
    _PARETO_HUD = {
        "fp32": True, "fp16": True, "int8": True, "int4": True,
        "int4_prune10": True, "int4_prune30": True, "int4_prune50": True,
        "int4_prune50_unstruct": False, "distil_4b_int8": False, "distil_1b_int8": False,
    }
    _BUDGETS_HUD = {"flagship": 8.0, "midrange": 4.0, "budget": 2.0}
    _SELECTED_HUD = {
        "flagship": act2_flagship.value,
        "midrange": act2_midrange.value,
        "budget": act2_budget.value,
    }

    _flagship_gb_hud = _SIZE_MAP_HUD.get(_SELECTED_HUD["flagship"], 0.0)
    _midrange_gb_hud = _SIZE_MAP_HUD.get(_SELECTED_HUD["midrange"], 0.0)
    _budget_gb_hud = _SIZE_MAP_HUD.get(_SELECTED_HUD["budget"], 0.0)
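
    # Failure state: OOM if any tier's selected configuration exceeds its memory budget.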
    _constraint_hit_hud = (
        _flagship_gb_hud > _BUDGETS_HUD["flagship"]
        or _midrange_gb_hud > _BUDGETS_HUD["midrange"]
        or _budget_gb_hud > _BUDGETS_HUD["budget"]
    )
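    # The ledger's headline compression stats are taken from the Flagship tier selection.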
    _compression_method_hud = _SELECTED_HUD["flagship"]
    _compression_ratio_hud = LLAMA3_8B_FP32_GB / max(_flagship_gb_hud, 0.01)
    _pareto_optimal_hud = all(
        _PARETO_HUD.get(k, False) for k in _SELECTED_HUD.values()
    )

    ledger.save(
        chapter=10,
        design={
            "context": _ctx_hud,
            "compression_method": _compression_method_hud,
            "compression_ratio": round(_compression_ratio_hud, 2),
            "act1_prediction": _a1_pred_hud,
            "act1_correct": _a1_correct_hud,
            "act2_result": _budget_gb_hud,
            "act2_decision": (
                f"flagship={_SELECTED_HUD['flagship']};"
                f"mid={_SELECTED_HUD['midrange']};"
                f"budget={_SELECTED_HUD['budget']}"
            ),
            "constraint_hit": _constraint_hit_hud,
            "pareto_optimal": _pareto_optimal_hud,
        },
    )

    # ── HUD color coding ──────────────────────────────────────────────────────
    _green = "#4ade80"
    _red = "#f87171"
    _yellow = "#fbbf24"
    _muted = "#94a3b8"

    _a1_icon = _green if _a1_correct_hud else _yellow
    _oom_icon = _red if _constraint_hit_hud else _green
    _pf_icon = _green if _pareto_optimal_hud else _yellow

    mo.vstack([
        mo.md("---"),
        mo.Html(f"""
        <div style="background:#0f172a; border-radius:12px; padding:16px 28px;
                    margin-top:24px; border:1px solid #1e293b;
                    font-family:'SF Mono', 'Fira Code', monospace; font-size:0.8rem;">
          <div style="color:#475569; font-size:0.68rem; font-weight:700;
                      text-transform:uppercase; letter-spacing:0.12em; margin-bottom:12px;">
            Design Ledger — Chapter 10 Saved
          </div>
          <div style="display:flex; gap:32px; flex-wrap:wrap; align-items:center;">
            <div>
              <span style="color:{_muted}; font-weight:600;">CONTEXT</span>
              <span style="color:#e2e8f0;">{_ctx_hud.upper()}</span>
            </div>
            <div>
              <span style="color:{_muted}; font-weight:600;">ACT I PREDICTION</span>
              <span style="color:{_a1_icon};">
                {_a1_pred_hud} — {'CORRECT' if _a1_correct_hud else 'INCORRECT'}
              </span>
            </div>
            <div>
              <span style="color:{_muted}; font-weight:600;">COMPRESSION</span>
              <span style="color:#e2e8f0;">
                {_compression_method_hud} ({_compression_ratio_hud:.1f}x)
              </span>
            </div>
            <div>
              <span style="color:{_muted}; font-weight:600;">OOM HIT</span>
              <span style="color:{_oom_icon};">
                {'YES' if _constraint_hit_hud else 'NO'}
              </span>
            </div>
            <div>
              <span style="color:{_muted}; font-weight:600;">PARETO-OPTIMAL</span>
              <span style="color:{_pf_icon};">
                {'YES' if _pareto_optimal_hud else 'NO'}
              </span>
            </div>
            <div>
              <span style="color:{_muted}; font-weight:600;">BUDGET TIER</span>
              <span style="color:#e2e8f0;">
                {_budget_gb_hud:.1f} GB / 2 GB limit
              </span>
            </div>
          </div>
        </div>
        """),
        mo.callout(
            mo.md(
                "**Lab 10 complete.** Your compression decisions are saved to the Design Ledger "
                "and will be referenced in Lab 11 (Hardware Acceleration — Roofline Model), "
                "where you will compute the arithmetic intensity of your compressed model and "
                "see where it falls relative to the memory bandwidth and compute ceilings."
            ),
            kind="success" if (not _constraint_hit_hud and _pareto_optimal_hud) else "info",
        ),
    ])
    return


if __name__ == "__main__":
    app.run()