fix: resolve cross-cell export gaps found during comprehensive HTML build verification

After the class-based namespace isolation pass, missing EXPORTS bridge
variables were discovered by running all chapters through the HTML build pipeline.

Vol1 fixes:
- nn_computation: add hog_grid_str/hog_bins_str exports; convert generator
  expressions to for-loops (Python 3 class scope skips class namespace);
  add mnist_large/small_l1/l2 exports for footnote inline Python
- ml_systems: add cloud_compute/memory/ai_frac, mobile_tops/bw/ratio/
  bottleneck/compute/memory_frac, cloud_thresh_bw_str, edge_thresh_bw_str
  exports; complete ResnetMobile EXPORTS section
- data_selection: fix FpScalingCalc invariant (min_samples_threshold 50→150
  so 100 expected rare samples < 150 threshold holds true)
- model_compression: FusionCalc bandwidth_reduction invariant 50→40%
- nn_architectures: add 'param' unit to lighthouse-table-specs imports

Vol2 fixes:
- data_storage: add missing 'watt' import to chapter setup cell
- fault_tolerance: export per_node_gbs raw float for prose arithmetic
- appendix_fleet: export rho_7b raw float for fmt() call in prose
- appendix_c3: add .magnitude to calc_effective_flops() result (returns
  Quantity since formulas.py upgrade, not raw float)
- appendix_reliability: wrap worked-example-young-daly in class with EXPORTS

All 43 chapters with Python cells verified passing after fixes.
This commit is contained in:
Vijay Janapa Reddi
2026-02-21 14:20:43 -05:00
parent 5677633b4c
commit b887b91a2c
10 changed files with 2928 additions and 1729 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -219,7 +219,7 @@ The quantitative characteristics of these Lighthouse models expose a critical en
from mlsys import Hardware, Models from mlsys import Hardware, Models
from mlsys.constants import ( from mlsys.constants import (
A100_MEM_CAPACITY, BYTES_FP32, BYTES_FP16, Mparam, Bparam, Kparam, GFLOPs, MFLOPs, MB, GB, KB, GiB A100_MEM_CAPACITY, BYTES_FP32, BYTES_FP16, param, Mparam, Bparam, Kparam, GFLOPs, MFLOPs, MB, GB, KB, GiB
) )
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
from mlsys.formulas import model_memory from mlsys.formulas import model_memory
@@ -242,35 +242,35 @@ class LighthouseSpecs:
# ┌── 2. CALCULATION (The Physics) ───────────────────────────────────────── # ┌── 2. CALCULATION (The Physics) ─────────────────────────────────────────
# ResNet-50 # ResNet-50
resnet_params = m_resnet.parameters.to(Mparam).magnitude resnet_params = m_resnet.parameters.m_as(Mparam)
resnet_flops = m_resnet.inference_flops.to(GFLOPs).magnitude resnet_flops = m_resnet.inference_flops.m_as(GFLOPs)
resnet_mem_mb = m_resnet.size_in_bytes(BYTES_FP32).to(MB).magnitude resnet_mem_mb = m_resnet.size_in_bytes(BYTES_FP32).m_as(MB)
# GPT-2 XL # GPT-2 XL
gpt2_params = m_gpt2.parameters.to(Bparam).magnitude gpt2_params = m_gpt2.parameters.m_as(Bparam)
gpt2_flops_token = 3.0 # Approximate gpt2_flops_token = 3.0 # Approximate
gpt2_mem_gb = m_gpt2.size_in_bytes(BYTES_FP32).to(GB).magnitude gpt2_mem_gb = m_gpt2.size_in_bytes(BYTES_FP32).m_as(GB)
# DLRM # DLRM
dlrm_entries_b = 25.0 # 25B entries dlrm_entries_b = 25.0 # 25B entries
dlrm_mem_gb = m_dlrm.model_size.to(GB).magnitude dlrm_mem_gb = m_dlrm.model_size.m_as(GB)
# MobileNetV2 # MobileNetV2
mobilenet_params = m_mobilenet.parameters.to(Mparam).magnitude mobilenet_params = m_mobilenet.parameters.m_as(Mparam)
mobilenet_flops = m_mobilenet.inference_flops.to(MFLOPs).magnitude mobilenet_flops = m_mobilenet.inference_flops.m_as(MFLOPs)
mobilenet_mem_mb = m_mobilenet.size_in_bytes(BYTES_FP32).to(MB).magnitude mobilenet_mem_mb = m_mobilenet.size_in_bytes(BYTES_FP32).m_as(MB)
# KWS (DS-CNN) # KWS (DS-CNN)
kws_params_k = m_kws.parameters.to(Kparam).magnitude kws_params_k = m_kws.parameters.m_as(Kparam)
kws_flops_m = m_kws.inference_flops.to(MFLOPs).magnitude kws_flops_m = m_kws.inference_flops.m_as(MFLOPs)
kws_mem_kb = m_kws.size_in_bytes(BYTES_FP32).to(KB).magnitude kws_mem_kb = m_kws.size_in_bytes(BYTES_FP32).m_as(KB)
# Ratios # Ratios
mobilenet_size_ratio = m_resnet.parameters.magnitude / m_mobilenet.parameters.magnitude mobilenet_size_ratio = m_resnet.parameters.m_as(param) / m_mobilenet.parameters.m_as(param)
mobilenet_flops_ratio = (m_resnet.inference_flops / m_mobilenet.inference_flops).to('count').magnitude mobilenet_flops_ratio = (m_resnet.inference_flops / m_mobilenet.inference_flops).m_as('count')
# Reference Hardware # Reference Hardware
a100_mem = hw_a100.memory_capacity.to(GiB).magnitude a100_mem = hw_a100.memory_capacity.m_as(GiB)
# ┌── 3. INVARIANTS (Guardrails) ─────────────────────────────────────────── # ┌── 3. INVARIANTS (Guardrails) ───────────────────────────────────────────
# Ensure numbers match the book's narrative # Ensure numbers match the book's narrative
@@ -288,7 +288,7 @@ class LighthouseSpecs:
gpt2_gflops_per_token_str = fmt(gpt2_flops_token, precision=1) gpt2_gflops_per_token_str = fmt(gpt2_flops_token, precision=1)
# GPT-3 context # GPT-3 context
gpt3_fp16_gb_str = fmt(Models.GPT3.size_in_bytes(BYTES_FP16).to(GB).magnitude, precision=0) gpt3_fp16_gb_str = fmt(Models.GPT3.size_in_bytes(BYTES_FP16).m_as(GB), precision=0)
dlrm_entries_b_str = fmt(dlrm_entries_b, precision=0) dlrm_entries_b_str = fmt(dlrm_entries_b, precision=0)
dlrm_model_size_gb_str = fmt(dlrm_mem_gb, precision=0) dlrm_model_size_gb_str = fmt(dlrm_mem_gb, precision=0)
@@ -490,8 +490,8 @@ class MLPvsCNN:
check(ratio >= 10, f"MLP ({mlp_p}) isn't significantly larger than CNN ({cnn_p}). Ratio: {ratio}x") check(ratio >= 10, f"MLP ({mlp_p}) isn't significantly larger than CNN ({cnn_p}). Ratio: {ratio}x")
# ┌── 4. OUTPUTS (Formatting) ────────────────────────────────────────────── # ┌── 4. OUTPUTS (Formatting) ──────────────────────────────────────────────
mlp_params_str = f"{(mlp_p * param).to(Mparam).magnitude:.0f}M" mlp_params_str = f"{(mlp_p * param).m_as(Mparam):.0f}M"
cnn_params_str = f"{(cnn_p * param).to(Kparam).magnitude:.0f}K" cnn_params_str = f"{(cnn_p * param).m_as(Kparam):.0f}K"
param_ratio_str = f"{ratio}" param_ratio_str = f"{ratio}"
# Note: Use MLPvsCNN.mlp_params_str directly. # Note: Use MLPvsCNN.mlp_params_str directly.
@@ -859,10 +859,10 @@ class A100Specs:
# ┌── 1. PARAMETERS (Inputs) ─────────────────────────────────────────────── # ┌── 1. PARAMETERS (Inputs) ───────────────────────────────────────────────
# A100 performance at various precisions # A100 performance at various precisions
fp16_tensor = A100_FLOPS_FP16_TENSOR.to(TFLOPs/second).magnitude fp16_tensor = A100_FLOPS_FP16_TENSOR.m_as(TFLOPs/second)
int8_tensor = A100_FLOPS_INT8.to(TFLOPs/second).magnitude int8_tensor = A100_FLOPS_INT8.m_as(TFLOPs/second)
fp32_cuda = A100_FLOPS_FP32.to(TFLOPs/second).magnitude fp32_cuda = A100_FLOPS_FP32.m_as(TFLOPs/second)
tf32_tensor = A100_FLOPS_TF32.to(TFLOPs/second).magnitude tf32_tensor = A100_FLOPS_TF32.m_as(TFLOPs/second)
# ┌── 4. OUTPUTS (Formatting) ────────────────────────────────────────────── # ┌── 4. OUTPUTS (Formatting) ──────────────────────────────────────────────
a100_tflops_fp16_str = fmt(fp16_tensor, precision=0, commas=False) a100_tflops_fp16_str = fmt(fp16_tensor, precision=0, commas=False)
@@ -2364,17 +2364,27 @@ Attention mechanisms create computational patterns that differ significantly fro
# │ Exports: attn_score_macs_m_str # │ Exports: attn_score_macs_m_str
# └───────────────────────────────────────────────────────────────────────────── # └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import MILLION
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
# --- Inputs (typical attention configuration) --- class AttentionComputeCosts:
attn_seq_len_value = 512 # sequence length """Demonstrate quadratic compute cost of self-attention at sequence length 512."""
attn_head_dim_value = 64 # dimension per head
# --- Computation costs --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
attn_score_macs_value = attn_seq_len_value * attn_seq_len_value * attn_head_dim_value seq_len = 512 # sequence length
head_dim = 64 # dimension per head
# --- Outputs (formatted strings for prose) --- # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
attn_score_macs_m_str = fmt(attn_score_macs_value / MILLION, precision=1, commas=False) # e.g. "16.8" score_macs = seq_len * seq_len * head_dim
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(score_macs > MILLION, "Attention MACs should exceed 1M for seq_len=512.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
attn_score_macs_m_str = fmt(score_macs / MILLION, precision=1, commas=False) # e.g. "16.8"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
attn_score_macs_m_str = AttentionComputeCosts.attn_score_macs_m_str
``` ```
::: {#lst-attention_layer_compute lst-cap="**Attention Computation**: Two implementations showing the same O(N^2 $\times$ d) complexity. The matrix form (top) uses optimized GEMM, while the nested loops (bottom) expose the quadratic pairwise comparisons: for sequence length 512 and dimension 64, computing attention scores requires 512 $\times$ 512 $\times$ 64 = `{python} attn_score_macs_m_str` million MACs per attention head, plus another `{python} attn_score_macs_m_str`M for value aggregation."}
@@ -2471,7 +2481,7 @@ class AttentionMemory:
# ┌── 1. PARAMETERS (Inputs) ─────────────────────────────────────────────── # ┌── 1. PARAMETERS (Inputs) ───────────────────────────────────────────────
seq_len = 100_000 seq_len = 100_000
bytes_per_element = BYTES_FP16.magnitude bytes_per_element = BYTES_FP16.m_as(byte)
num_layers = 32 num_layers = 32
num_heads = 12 num_heads = 12
@@ -2886,7 +2896,7 @@ class DLRMEmbedding:
# ┌── 2. CALCULATION (The Physics) ───────────────────────────────────────── # ┌── 2. CALCULATION (The Physics) ─────────────────────────────────────────
table_bytes = num_users * embed_dim * bytes_per_param table_bytes = num_users * embed_dim * bytes_per_param
table_gb = (table_bytes * byte).to(GB).magnitude table_gb = (table_bytes * byte).m_as(GB)
# ┌── 3. INVARIANTS (Guardrails) ─────────────────────────────────────────── # ┌── 3. INVARIANTS (Guardrails) ───────────────────────────────────────────
check(table_gb >= 80, f"DLRM table ({table_gb:.1f} GB) fits on an A100. It must be larger to justify model parallelism.") check(table_gb >= 80, f"DLRM table ({table_gb:.1f} GB) fits on an A100. It must be larger to justify model parallelism.")
@@ -2964,12 +2974,12 @@ class CapacityWall:
# ┌── 1. PARAMETERS (Inputs) ─────────────────────────────────────────────── # ┌── 1. PARAMETERS (Inputs) ───────────────────────────────────────────────
num_items = 100_000_000 num_items = 100_000_000
embed_dim = 128 embed_dim = 128
bytes_per_param = BYTES_FP32.magnitude bytes_per_param = BYTES_FP32.m_as(byte)
# ┌── 2. CALCULATION (The Physics) ───────────────────────────────────────── # ┌── 2. CALCULATION (The Physics) ─────────────────────────────────────────
table_bytes = num_items * embed_dim * bytes_per_param table_bytes = num_items * embed_dim * bytes_per_param
table_gb = (table_bytes * byte).to(GB).magnitude table_gb = (table_bytes * byte).m_as(GB)
a100_capacity_gb = A100_MEM_CAPACITY.to(GB).magnitude a100_capacity_gb = A100_MEM_CAPACITY.m_as(GB)
utilization_pct = (table_gb / a100_capacity_gb) * 100 utilization_pct = (table_gb / a100_capacity_gb) * 100
# ┌── 4. OUTPUTS (Formatting) ────────────────────────────────────────────── # ┌── 4. OUTPUTS (Formatting) ──────────────────────────────────────────────
@@ -3166,13 +3176,27 @@ Recall the plain 50-layer network from the analysis above: loss stuck at 1.8, on
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
# --- Empirical overhead measurements --- class ResNetSkipOverhead:
skip_memory_overhead_pct_value = 20 # activation storage """Quantify systems cost of residual connections: ~20% memory overhead."""
skip_epoch_cost_pct_value = 10 # per-epoch compute
# --- Outputs (formatted strings for prose) --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
skip_memory_overhead_pct_str = fmt(skip_memory_overhead_pct_value, precision=0, commas=False) # e.g. "20" memory_overhead_pct = 20 # activation storage
skip_epoch_cost_pct_str = fmt(skip_epoch_cost_pct_value, precision=0, commas=False) # e.g. "10" epoch_cost_pct = 10 # per-epoch compute
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
# Values are empirical anchors; no derived calculation needed.
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(0 < memory_overhead_pct < 100, "Memory overhead must be a valid percentage.")
check(0 < epoch_cost_pct < 100, "Epoch cost must be a valid percentage.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
skip_memory_overhead_pct_str = fmt(memory_overhead_pct, precision=0, commas=False) # e.g. "20"
skip_epoch_cost_pct_str = fmt(epoch_cost_pct, precision=0, commas=False) # e.g. "10"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
skip_memory_overhead_pct_str = ResNetSkipOverhead.skip_memory_overhead_pct_str
skip_epoch_cost_pct_str = ResNetSkipOverhead.skip_epoch_cost_pct_str
``` ```
While skip connections solve gradient flow, they introduce system-level costs. Memory overhead increases because skip connections require storing the input to each residual block for the addition operation during the forward pass and for backpropagation. For a ResNet-50 with batch size 32 processing $224 \times 224$ RGB images, this adds approximately `{python} skip_memory_overhead_pct_str`% memory overhead compared to a plain network. The computational cost of the addition operation ($y = \mathcal{F}(x) + x$) is computationally trivial, adding negligible compute time. The primary cost is the residual function $\mathcal{F}(x)$ itself.
@@ -3654,16 +3678,29 @@ Energy consumption patterns vary dramatically across neural network architecture
# │ Exports: energy_mac_pj_str, energy_dram_str # │ Exports: energy_mac_pj_str, energy_dram_str
# └───────────────────────────────────────────────────────────────────────────── # └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import ENERGY_DRAM_ACCESS_PJ from mlsys.constants import ENERGY_DRAM_ACCESS_PJ, ureg
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
# --- Energy costs (from Horowitz 2014) --- class EnergyConsumptionAnalysis:
energy_mac_pj_value = 4.6 # pJ per MAC (45nm) """Contrast energy cost of compute vs. data movement: DRAM access is ~5x more costly."""
energy_dram_value = ENERGY_DRAM_ACCESS_PJ.magnitude # pJ per 32-bit access
# --- Outputs (formatted strings for prose) --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
energy_mac_pj_str = f"{energy_mac_pj_value}" # e.g. "4.6" mac_pj = 4.6 # pJ per MAC (Horowitz 2014, 45nm)
energy_dram_str = fmt(energy_dram_value, precision=0, commas=False) # e.g. "26" dram_pj = ENERGY_DRAM_ACCESS_PJ.m_as(ureg.picojoule) # pJ per 32-bit access
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
dram_to_mac_ratio = dram_pj / mac_pj
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(dram_to_mac_ratio > 1, "DRAM access must cost more energy than a MAC.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
energy_mac_pj_str = f"{mac_pj}" # e.g. "4.6"
energy_dram_str = fmt(dram_pj, precision=0, commas=False) # e.g. "26"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
energy_mac_pj_str = EnergyConsumptionAnalysis.energy_mac_pj_str
energy_dram_str = EnergyConsumptionAnalysis.energy_dram_str
``` ```
Dense matrix operations in MLPs achieve excellent arithmetic intensity[^fn-arithmetic-intensity-dnn] (computation per data movement) but consume significant absolute energy. Each multiply-accumulate operation consumes approximately `{python} energy_mac_pj_str` pJ, while data movement from DRAM costs `{python} energy_dram_str` pJ per 32-bit value [@horowitz2014computing]. Given this energy ratio, typical MLP inference spends the majority of its energy budget on data movement rather than computation, making memory bandwidth optimization critical for energy efficiency.
@@ -3745,17 +3782,29 @@ CNNs benefit from specialized convolution algorithms and data layout optimizatio
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
# --- Standard vs Winograd multiply counts for 3x3 conv --- class WinogradCalc:
std_muls_3x3_value = 9 # 3x3 = 9 muls """Demonstrate 2.25x multiplication reduction of Winograd F(2,3) vs standard 3x3 conv."""
winograd_muls_value = 4 # Winograd F(2,3)
# --- Reduction ratio --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
winograd_reduction_value = std_muls_3x3_value / winograd_muls_value std_muls_3x3 = 9 # 3x3 = 9 multiplies
winograd_muls = 4 # Winograd F(2,3) multiplies
# --- Outputs (formatted strings for prose) --- # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
winograd_reduction_str = fmt(winograd_reduction_value, precision=2, commas=False) # e.g. "2.25" winograd_reduction = std_muls_3x3 / winograd_muls
std_muls_3x3_str = f"{std_muls_3x3_value}" # e.g. "9"
winograd_muls_str = f"{winograd_muls_value}" # e.g. "4" # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(winograd_reduction > 1, "Winograd must reduce multiply count.")
check(abs(winograd_reduction - 2.25) < 0.01, "Winograd F(2,3) must yield 2.25x reduction.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
winograd_reduction_str = fmt(winograd_reduction, precision=2, commas=False) # e.g. "2.25"
std_muls_3x3_str = f"{std_muls_3x3}" # e.g. "9"
winograd_muls_str = f"{winograd_muls}" # e.g. "4"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
winograd_reduction_str = WinogradCalc.winograd_reduction_str
std_muls_3x3_str = WinogradCalc.std_muls_3x3_str
winograd_muls_str = WinogradCalc.winograd_muls_str
``` ```
[^fn-winograd]: **Winograd Algorithms**\index{Winograd Algorithm}: Fast convolution algorithms based on Shmuel Winograd's 1980 work on minimal multiplication complexity. For 3 $\times$ 3 convolutions, Winograd reduces multiply operations from `{python} std_muls_3x3_str` to `{python} winograd_muls_str` per output (`{python} winograd_reduction_str` $\times$ reduction) by trading multiplications for additions, which cost less in terms of both latency and energy. Modern deep learning frameworks like cuDNN automatically select Winograd for appropriate layer configurations, though numerical precision degradation at FP16 limits applicability for mixed-precision training.
@@ -3883,32 +3932,50 @@ This section synthesizes the chapter's concepts through a complete architecture
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
from mlsys.constants import RESNET50_FLOPs, GFLOPs, TFLOPs from mlsys.constants import RESNET50_FLOPs, GFLOPs, TFLOPs
# --- Inputs (real-time video processing) --- class ThroughputCeilingCalc:
tc_fps_value = 30 # target frame rate """Evaluate real-time vision feasibility: ResNet-50 at 30 FPS leaves ample headroom."""
tc_midrange_gpu_tflops_value = 10 # reference mid-range GPU
tc_objdet_gflops_value = 100 # object detection model
# --- Computation --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
tc_resnet_gflops_value = RESNET50_FLOPs.to(GFLOPs).magnitude fps = 30 # target frame rate
tc_sustained_gflops_value = tc_fps_value * tc_resnet_gflops_value midrange_gpu_tflops = 10 # reference mid-range GPU (TFLOPS)
tc_effective_tflops_low_value = tc_midrange_gpu_tflops_value * 0.50 # 50% utilization objdet_gflops = 100 # object detection model (GFLOPs)
tc_effective_tflops_high_value = tc_midrange_gpu_tflops_value * 0.60 # 60% utilization
tc_headroom_value = tc_effective_tflops_low_value * 1000 / tc_sustained_gflops_value
tc_objdet_sustained_value = (tc_fps_value * tc_objdet_gflops_value * GFLOPs).to(TFLOPs).magnitude # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
tc_objdet_headroom_value = tc_effective_tflops_low_value / tc_objdet_sustained_value resnet_gflops = RESNET50_FLOPs.m_as(GFLOPs)
sustained_gflops = fps * resnet_gflops
effective_tflops_low = midrange_gpu_tflops * 0.50 # 50% utilization
effective_tflops_high = midrange_gpu_tflops * 0.60 # 60% utilization
headroom = effective_tflops_low * 1000 / sustained_gflops
# --- Outputs (formatted strings for prose) --- objdet_sustained_tflops = (fps * objdet_gflops * GFLOPs).m_as(TFLOPs)
tc_fps_str = f"{tc_fps_value}" # e.g. "30" objdet_headroom = effective_tflops_low / objdet_sustained_tflops
tc_resnet_gflops_str = fmt(tc_resnet_gflops_value, precision=0, commas=False) # e.g. "4"
tc_sustained_gflops_str = fmt(tc_sustained_gflops_value, precision=0, commas=False) # e.g. "123" # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
tc_gpu_tflops_str = f"{tc_midrange_gpu_tflops_value}" # e.g. "10" check(headroom > 1, "ResNet-50 at 30 FPS must leave compute headroom on a mid-range GPU.")
tc_effective_low_str = fmt(tc_effective_tflops_low_value, precision=0, commas=False) # e.g. "5"
tc_effective_high_str = fmt(tc_effective_tflops_high_value, precision=0, commas=False) # e.g. "6" # ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
tc_headroom_str = fmt(tc_headroom_value, precision=0, commas=False) # e.g. "41" tc_fps_str = f"{fps}" # e.g. "30"
tc_objdet_gflops_str = f"{tc_objdet_gflops_value}" # e.g. "100" tc_resnet_gflops_str = fmt(resnet_gflops, precision=0, commas=False) # e.g. "4"
tc_objdet_sustained_str = fmt(tc_objdet_sustained_value, precision=0, commas=False) # e.g. "3" tc_sustained_gflops_str = fmt(sustained_gflops, precision=0, commas=False) # e.g. "123"
tc_objdet_headroom_str = fmt(tc_objdet_headroom_value, precision=0, commas=False) # e.g. "2" tc_gpu_tflops_str = f"{midrange_gpu_tflops}" # e.g. "10"
tc_effective_low_str = fmt(effective_tflops_low, precision=0, commas=False) # e.g. "5"
tc_effective_high_str = fmt(effective_tflops_high, precision=0, commas=False) # e.g. "6"
tc_headroom_str = fmt(headroom, precision=0, commas=False) # e.g. "41"
tc_objdet_gflops_str = f"{objdet_gflops}" # e.g. "100"
tc_objdet_sustained_str = fmt(objdet_sustained_tflops, precision=0, commas=False) # e.g. "3"
tc_objdet_headroom_str = fmt(objdet_headroom, precision=0, commas=False) # e.g. "2"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
tc_fps_str = ThroughputCeilingCalc.tc_fps_str
tc_resnet_gflops_str = ThroughputCeilingCalc.tc_resnet_gflops_str
tc_sustained_gflops_str = ThroughputCeilingCalc.tc_sustained_gflops_str
tc_gpu_tflops_str = ThroughputCeilingCalc.tc_gpu_tflops_str
tc_effective_low_str = ThroughputCeilingCalc.tc_effective_low_str
tc_effective_high_str = ThroughputCeilingCalc.tc_effective_high_str
tc_headroom_str = ThroughputCeilingCalc.tc_headroom_str
tc_objdet_gflops_str = ThroughputCeilingCalc.tc_objdet_gflops_str
tc_objdet_sustained_str = ThroughputCeilingCalc.tc_objdet_sustained_str
tc_objdet_headroom_str = ThroughputCeilingCalc.tc_objdet_headroom_str
``` ```
::: {.callout-notebook title="The Throughput Ceiling"}
@@ -3944,50 +4011,68 @@ tc_objdet_headroom_str = fmt(tc_objdet_headroom_value, precision=0, commas=False
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
from mlsys.constants import KWS_DSCNN_PARAMS, KWS_DSCNN_FLOPs, Kparam, MFLOPs from mlsys.constants import KWS_DSCNN_PARAMS, KWS_DSCNN_FLOPs, Kparam, MFLOPs
# --- MobileNetV1 specs --- class WildlifeModelSizing:
mnv1_params_m_value = 4.2 # millions of params """Select model architecture for constrained edge deployment: MobileNetV2 fits 512 MB."""
mnv1_flops_mflops_value = 569 # MFLOPs at 224x224
# --- MobileNetV2 (0.75x width) specs --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
mnv2_params_m_value = 2.2 # millions of params # MobileNetV1 specs
mnv2_flops_mflops_value = 150 # MFLOPs at 224x224 mnv1_params_m = 4.2 # millions of params
mnv1_flops_mflops = 569 # MFLOPs at 224x224
# --- Edge deployment power assumptions --- # MobileNetV2 (0.75x width) specs
inference_power_mw_value = 200 # milliwatts during inference mnv2_params_m = 2.2 # millions of params
inference_latency_ms_value = 75 # ms per inference mnv2_flops_mflops = 150 # MFLOPs at 224x224
inferences_per_day_value = 100 # trigger-based
# --- Memory calculations --- # Edge deployment power assumptions
mnv1_fp32_mb_value = mnv1_params_m_value * 4 # FP32: 4 bytes/param inference_power_mw = 200 # milliwatts during inference
mnv1_int8_mb_value = mnv1_params_m_value * 1 # INT8: 1 byte/param inference_latency_ms = 75 # ms per inference
mnv2_fp32_mb_value = mnv2_params_m_value * 4 inferences_per_day = 100 # trigger-based
mnv2_int8_mb_value = mnv2_params_m_value * 1
# --- KWS reference (too small for 50-species task) --- # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
kws_example_params_k_value = KWS_DSCNN_PARAMS.to(Kparam).magnitude # Memory footprints
kws_example_flops_mflops_value = KWS_DSCNN_FLOPs.to(MFLOPs).magnitude mnv1_fp32_mb = mnv1_params_m * 4 # FP32: 4 bytes/param
mnv1_int8_mb = mnv1_params_m * 1 # INT8: 1 byte/param
mnv2_fp32_mb = mnv2_params_m * 4
mnv2_int8_mb = mnv2_params_m * 1
# --- Energy calculations --- # KWS reference (too small for 50-species task)
energy_per_inf_mj_value = ( kws_example_params_k = KWS_DSCNN_PARAMS.m_as(Kparam)
inference_power_mw_value * inference_latency_ms_value / 1000 kws_example_flops_mflops = KWS_DSCNN_FLOPs.m_as(MFLOPs)
)
energy_per_day_j_value = (
inferences_per_day_value * energy_per_inf_mj_value / 1000
)
# --- Outputs (formatted strings for prose) --- # Energy
mnv1_params_str = fmt(mnv1_params_m_value, precision=1, commas=False) # e.g. "4.2" energy_per_inf_mj = inference_power_mw * inference_latency_ms / 1000
mnv1_flops_str = fmt(mnv1_flops_mflops_value, precision=0, commas=False) # e.g. "569" energy_per_day_j = inferences_per_day * energy_per_inf_mj / 1000
mnv1_fp32_str = fmt(mnv1_fp32_mb_value, precision=0, commas=False) # e.g. "17"
mnv1_int8_str = fmt(mnv1_int8_mb_value, precision=0, commas=False) # e.g. "4" # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
mnv2_params_str = fmt(mnv2_params_m_value, precision=1, commas=False) # e.g. "2.2" check(mnv2_int8_mb < 512, "MobileNetV2 INT8 must fit in 512 MB edge RAM.")
mnv2_flops_str = fmt(mnv2_flops_mflops_value, precision=0, commas=False) # e.g. "150"
mnv2_fp32_str = fmt(mnv2_fp32_mb_value, precision=0, commas=False) # e.g. "9" # ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
mnv2_int8_str = fmt(mnv2_int8_mb_value, precision=1, commas=False) # e.g. "2.2" mnv1_params_str = fmt(mnv1_params_m, precision=1, commas=False) # e.g. "4.2"
kws_example_params_str = fmt(kws_example_params_k_value, precision=0, commas=False) # e.g. "26" mnv1_flops_str = fmt(mnv1_flops_mflops, precision=0, commas=False) # e.g. "569"
kws_example_flops_str = fmt(kws_example_flops_mflops_value, precision=0, commas=False) # e.g. "6" mnv1_fp32_str = fmt(mnv1_fp32_mb, precision=0, commas=False) # e.g. "17"
energy_mj_str = fmt(energy_per_inf_mj_value, precision=0, commas=False) # e.g. "15" mnv1_int8_str = fmt(mnv1_int8_mb, precision=0, commas=False) # e.g. "4"
energy_j_str = fmt(energy_per_day_j_value, precision=1, commas=False) # e.g. "1.5" mnv2_params_str = fmt(mnv2_params_m, precision=1, commas=False) # e.g. "2.2"
mnv2_flops_str = fmt(mnv2_flops_mflops, precision=0, commas=False) # e.g. "150"
mnv2_fp32_str = fmt(mnv2_fp32_mb, precision=0, commas=False) # e.g. "9"
mnv2_int8_str = fmt(mnv2_int8_mb, precision=1, commas=False) # e.g. "2.2"
kws_example_params_str = fmt(kws_example_params_k, precision=0, commas=False) # e.g. "26"
kws_example_flops_str = fmt(kws_example_flops_mflops, precision=0, commas=False) # e.g. "6"
energy_mj_str = fmt(energy_per_inf_mj, precision=0, commas=False) # e.g. "15"
energy_j_str = fmt(energy_per_day_j, precision=1, commas=False) # e.g. "1.5"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
mnv1_params_str = WildlifeModelSizing.mnv1_params_str
mnv1_flops_str = WildlifeModelSizing.mnv1_flops_str
mnv1_fp32_str = WildlifeModelSizing.mnv1_fp32_str
mnv1_int8_str = WildlifeModelSizing.mnv1_int8_str
mnv2_params_str = WildlifeModelSizing.mnv2_params_str
mnv2_flops_str = WildlifeModelSizing.mnv2_flops_str
mnv2_fp32_str = WildlifeModelSizing.mnv2_fp32_str
mnv2_int8_str = WildlifeModelSizing.mnv2_int8_str
kws_example_params_str = WildlifeModelSizing.kws_example_params_str
kws_example_flops_str = WildlifeModelSizing.kws_example_flops_str
energy_mj_str = WildlifeModelSizing.energy_mj_str
energy_j_str = WildlifeModelSizing.energy_j_str
``` ```
With the throughput ceiling established, we can now apply the complete decision framework to a realistic scenario that exercises every step.
@@ -4099,11 +4184,23 @@ Engineers add attention to CNNs or convolutions to Transformers expecting additi
from mlsys.constants import A100_MEM_CAPACITY, GiB from mlsys.constants import A100_MEM_CAPACITY, GiB
# --- 8-GPU cluster memory --- class A100ClusterMemory:
a100_8x_mem_value = int(A100_MEM_CAPACITY.to(GiB).magnitude) * 8 """Contrast datacenter and edge memory: 8-GPU A100 node vs 4 GB edge device."""
# --- Outputs (formatted strings for prose) --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
a100_8x_mem_str = f"{a100_8x_mem_value}" # e.g. "640" n_gpus = 8
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
a100_8x_mem = int(A100_MEM_CAPACITY.m_as(GiB)) * n_gpus
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(a100_8x_mem > 400, "8x A100 cluster should provide >400 GiB memory.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
a100_8x_mem_str = f"{a100_8x_mem}" # e.g. "640"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
a100_8x_mem_str = A100ClusterMemory.a100_8x_mem_str
``` ```
**Pitfall:** *Optimizing architectural decisions for training hardware without considering deployment constraints.* **Pitfall:** *Optimizing architectural decisions for training hardware without considering deployment constraints.*

File diff suppressed because it is too large Load Diff

View File

@@ -26,7 +26,6 @@ start_chapter("vol1:model_compression")
::: :::
## Purpose {.unnumbered} ## Purpose {.unnumbered}
\begin{marginfigure} \begin{marginfigure}
@@ -78,102 +77,137 @@ Bridging that gap requires a systematic discipline of *compression*: trading cap
from mlsys.constants import * from mlsys.constants import *
from mlsys.formatting import fmt, check, sci from mlsys.formatting import fmt, check, sci
# --- Inputs (GPU specs) --- class CompressionSetup:
a100_tflops_fp16_value = A100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude """Chapter-wide constants: GPU specs, energy physics, model sizes, device constraints."""
a100_tflops_int8_value = A100_FLOPS_INT8.to(TFLOPs / second).magnitude
a100_bw_tbs_value = A100_MEM_BW.to(TB / second).magnitude
a100_int8_speedup_value = int(a100_tflops_int8_value / a100_tflops_fp16_value)
# --- Inputs (energy/perf illustrative values) --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
int8_energy_reduction_value = 20 # Illustrative energy/perf values
mobilenet_int8_mj_value = 47 int8_energy_reduction = 20
mobilenet_fp32_mj_value = 312 mobilenet_int8_mj = 47
tpu_v4_tops_per_w_value = 0.9 mobilenet_fp32_mj = 312
v100_tops_per_w_value = 0.3 tpu_v4_tops_per_w = 0.9
bandwidth_bound_speedup_value = 4 v100_tops_per_w = 0.3
bandwidth_bound_speedup = 4
llm_7b_params = 7
gpt3_training_flops_exp = 23
# --- Inputs (energy: multiply-add operations from constants) --- # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
energy_dram_value = ENERGY_DRAM_ACCESS_PJ.magnitude # A100 specs
energy_dram_per_byte_value = ENERGY_DRAM_PJ_PER_BYTE.magnitude a100_tflops_fp16 = A100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second)
energy_flop_fp32_value = ENERGY_FLOP_FP32_PJ.magnitude a100_tflops_int8 = A100_FLOPS_INT8.m_as(TFLOPs / second)
energy_flop_int8_value = ENERGY_FLOP_INT8_PJ.magnitude a100_bw_tbs = A100_MEM_BW.m_as(TB / second)
a100_int8_speedup = int(a100_tflops_int8 / a100_tflops_fp16)
# Energy for addition operations (Horowitz 2014, 45nm process) # Energy from constants (Horowitz 2014, 45nm process)
energy_add_fp32_pj_value = ENERGY_ADD_FP32_PJ.to(ureg.picojoule).magnitude energy_dram = ENERGY_DRAM_ACCESS_PJ.m_as(ureg.picojoule)
energy_add_fp16_pj_value = ENERGY_ADD_FP16_PJ.to(ureg.picojoule).magnitude energy_dram_per_byte = ENERGY_DRAM_PJ_PER_BYTE.m_as(ureg.picojoule / ureg.byte)
energy_add_int32_pj_value = ENERGY_ADD_INT32_PJ.to(ureg.picojoule).magnitude energy_flop_fp32 = ENERGY_FLOP_FP32_PJ.m_as(ureg.picojoule / ureg.count)
energy_add_int8_pj_value = ENERGY_ADD_INT8_PJ.to(ureg.picojoule).magnitude energy_flop_int8 = ENERGY_FLOP_INT8_PJ.m_as(ureg.picojoule / ureg.count)
energy_mul_fp32_pj_value = ENERGY_FLOP_FP32_PJ.magnitude energy_add_fp32_pj = ENERGY_ADD_FP32_PJ.m_as(ureg.picojoule)
energy_add_fp16_pj = ENERGY_ADD_FP16_PJ.m_as(ureg.picojoule)
energy_add_int32_pj = ENERGY_ADD_INT32_PJ.m_as(ureg.picojoule)
energy_add_int8_pj = ENERGY_ADD_INT8_PJ.m_as(ureg.picojoule)
energy_mul_fp32_pj = ENERGY_FLOP_FP32_PJ.m_as(ureg.picojoule / ureg.count)
# INT8 vs FP32 energy ratio (MAC-to-MAC: multiply + add for each precision) # INT8 vs FP32 MAC energy ratio
fp32_mac_pj_value = energy_mul_fp32_pj_value + energy_add_fp32_pj_value # 3.7 + 0.9 = 4.6 pJ fp32_mac_pj = energy_mul_fp32_pj + energy_add_fp32_pj # 3.7 + 0.9 = 4.6 pJ
int8_mac_pj_value = energy_flop_int8_value + energy_add_int8_pj_value # 0.2 + 0.03 = 0.23 pJ int8_mac_pj = energy_flop_int8 + energy_add_int8_pj # 0.2 + 0.03 = 0.23 pJ
int8_fp32_energy_ratio_value = fp32_mac_pj_value / int8_mac_pj_value int8_fp32_energy_ratio = fp32_mac_pj / int8_mac_pj
# V100 specs # V100 specs
v100_bw_gbs_value = V100_MEM_BW.to(GB / second).magnitude v100_bw_gbs = V100_MEM_BW.m_as(GB / second)
v100_tflops_fp32_value = V100_FLOPS_FP32.to(TFLOPs / second).magnitude v100_tflops_fp32 = V100_FLOPS_FP32.m_as(TFLOPs / second)
# Model specs # Model specs
resnet_params_m_value = RESNET50_PARAMS.to(Mparam).magnitude resnet_params_m = RESNET50_PARAMS.m_as(Mparam)
resnet_gflops_value = RESNET50_FLOPs.to(GFLOPs).magnitude resnet_gflops = RESNET50_FLOPs.m_as(GFLOPs)
mobilenetv2_mflops_value = MOBILENETV2_FLOPs.to(GFLOPs).magnitude * 1000 mobilenetv2_mflops = MOBILENETV2_FLOPs.m_as(GFLOPs) * 1000
# LLM parameter/memory calculations # LLM memory
llm_7b_params_value = 7 llm_7b_mem_fp16_gb = llm_7b_params * 2
llm_7b_mem_fp16_gb_value = llm_7b_params_value * 2 llm_175b_params = GPT3_PARAMS.m_as(Bparam)
llm_175b_params_value = GPT3_PARAMS.to(Bparam).magnitude llm_175b_mem_fp16_gb = llm_175b_params * 2
llm_175b_mem_fp16_gb_value = llm_175b_params_value * 2
# Device memory constraints # Device memory
smartphone_ram_gb_value = SMARTPHONE_RAM_GB.to(GB).magnitude smartphone_ram_gb = SMARTPHONE_RAM_GB.m_as(GB)
mcu_ram_kb_value = MCU_RAM_KIB.to(KiB).magnitude mcu_ram_kb = MCU_RAM_KIB.m_as(KiB)
# GPT-3 training FLOPs # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
gpt3_training_flops_exp_value = 23 check(a100_int8_speedup >= 2, "A100 INT8 should be at least 2x faster than FP16.")
check(int8_fp32_energy_ratio > 1, "FP32 MAC must cost more energy than INT8 MAC.")
# --- Outputs (formatted strings for prose) --- # ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
a100_tflops_fp16_str = fmt(a100_tflops_fp16_value, precision=0, commas=False) a100_tflops_fp16_str = fmt(a100_tflops_fp16, precision=0, commas=False)
a100_tflops_int8_str = fmt(a100_tflops_int8_value, precision=0, commas=False) a100_tflops_int8_str = fmt(a100_tflops_int8, precision=0, commas=False)
a100_bw_tbs_str = fmt(a100_bw_tbs_value, precision=1, commas=False) a100_bw_tbs_str = fmt(a100_bw_tbs, precision=1, commas=False)
a100_int8_speedup_str = fmt(a100_int8_speedup_value, precision=0, commas=False) a100_int8_speedup_str = fmt(a100_int8_speedup, precision=0, commas=False)
int8_energy_reduction_str = fmt(int8_energy_reduction_value, precision=0, commas=False) int8_energy_reduction_str = fmt(int8_energy_reduction, precision=0, commas=False)
mobilenet_int8_mj_str = fmt(mobilenet_int8_mj_value, precision=0, commas=False) mobilenet_int8_mj_str = fmt(mobilenet_int8_mj, precision=0, commas=False)
mobilenet_fp32_mj_str = fmt(mobilenet_fp32_mj_value, precision=0, commas=False) mobilenet_fp32_mj_str = fmt(mobilenet_fp32_mj, precision=0, commas=False)
tpu_v4_tops_per_w_str = fmt(tpu_v4_tops_per_w_value, precision=1, commas=False) tpu_v4_tops_per_w_str = fmt(tpu_v4_tops_per_w, precision=1, commas=False)
v100_tops_per_w_str = fmt(v100_tops_per_w_value, precision=1, commas=False) v100_tops_per_w_str = fmt(v100_tops_per_w, precision=1, commas=False)
bandwidth_bound_speedup_str = fmt(bandwidth_bound_speedup_value, precision=0, commas=False) bandwidth_bound_speedup_str = fmt(bandwidth_bound_speedup, precision=0, commas=False)
energy_dram_str = fmt(energy_dram, precision=0, commas=False)
energy_dram_per_byte_str = fmt(energy_dram_per_byte, precision=0, commas=False)
energy_flop_fp32_str = f"{energy_flop_fp32}"
energy_flop_int8_str = f"{energy_flop_int8}"
energy_add_fp32_str = f"{energy_add_fp32_pj}"
energy_add_fp16_str = f"{energy_add_fp16_pj}"
energy_add_int32_str = f"{energy_add_int32_pj}"
energy_add_int8_str = f"{energy_add_int8_pj}"
energy_mul_fp32_str = f"{energy_mul_fp32_pj}"
int8_fp32_energy_ratio_str = fmt(int8_fp32_energy_ratio, precision=1, commas=False)
v100_bw_gbs_str = fmt(v100_bw_gbs, precision=0, commas=False)
v100_tflops_fp32_str = fmt(v100_tflops_fp32, precision=1, commas=False)
resnet_params_m_str = fmt(resnet_params_m, precision=1, commas=False)
resnet_gflops_str = fmt(resnet_gflops, precision=1, commas=False)
mobilenetv2_mflops_str = fmt(mobilenetv2_mflops, precision=0, commas=False)
llm_7b_str = f"{llm_7b_params}"
llm_7b_mem_str = fmt(llm_7b_mem_fp16_gb, precision=0, commas=False)
llm_175b_str = fmt(llm_175b_params, precision=0, commas=False)
llm_175b_mem_str = fmt(llm_175b_mem_fp16_gb, precision=0, commas=False)
smartphone_ram_str = f"{smartphone_ram_gb}"
mcu_ram_str = f"{mcu_ram_kb}"
gpt3_training_flops_str = f"$3.14 \\times 10^{{{gpt3_training_flops_exp}}}$"
energy_dram_str = fmt(energy_dram_value, precision=0, commas=False) # ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
energy_dram_per_byte_str = fmt(energy_dram_per_byte_value, precision=0, commas=False) a100_tflops_fp16_str = CompressionSetup.a100_tflops_fp16_str
energy_flop_fp32_str = f"{energy_flop_fp32_value}" a100_tflops_int8_str = CompressionSetup.a100_tflops_int8_str
energy_flop_int8_str = f"{energy_flop_int8_value}" a100_bw_tbs_str = CompressionSetup.a100_bw_tbs_str
a100_int8_speedup_str = CompressionSetup.a100_int8_speedup_str
energy_add_fp32_str = f"{energy_add_fp32_pj_value}" int8_energy_reduction_str = CompressionSetup.int8_energy_reduction_str
energy_add_fp16_str = f"{energy_add_fp16_pj_value}" mobilenet_int8_mj_str = CompressionSetup.mobilenet_int8_mj_str
energy_add_int32_str = f"{energy_add_int32_pj_value}" mobilenet_fp32_mj_str = CompressionSetup.mobilenet_fp32_mj_str
energy_add_int8_str = f"{energy_add_int8_pj_value}" tpu_v4_tops_per_w_str = CompressionSetup.tpu_v4_tops_per_w_str
energy_mul_fp32_str = f"{energy_mul_fp32_pj_value}" v100_tops_per_w_str = CompressionSetup.v100_tops_per_w_str
bandwidth_bound_speedup_str = CompressionSetup.bandwidth_bound_speedup_str
int8_fp32_energy_ratio_str = fmt(int8_fp32_energy_ratio_value, precision=1, commas=False) energy_dram_str = CompressionSetup.energy_dram_str
energy_dram_per_byte_str = CompressionSetup.energy_dram_per_byte_str
v100_bw_gbs_str = fmt(v100_bw_gbs_value, precision=0, commas=False) energy_flop_fp32_str = CompressionSetup.energy_flop_fp32_str
v100_tflops_fp32_str = fmt(v100_tflops_fp32_value, precision=1, commas=False) energy_flop_int8_str = CompressionSetup.energy_flop_int8_str
energy_add_fp32_str = CompressionSetup.energy_add_fp32_str
resnet_params_m_str = fmt(resnet_params_m_value, precision=1, commas=False) energy_add_fp16_str = CompressionSetup.energy_add_fp16_str
resnet_gflops_str = fmt(resnet_gflops_value, precision=1, commas=False) energy_add_int32_str = CompressionSetup.energy_add_int32_str
mobilenetv2_mflops_str = fmt(mobilenetv2_mflops_value, precision=0, commas=False) energy_add_int8_str = CompressionSetup.energy_add_int8_str
energy_mul_fp32_str = CompressionSetup.energy_mul_fp32_str
llm_7b_str = f"{llm_7b_params_value}" int8_fp32_energy_ratio_str = CompressionSetup.int8_fp32_energy_ratio_str
llm_7b_mem_str = fmt(llm_7b_mem_fp16_gb_value, precision=0, commas=False) v100_bw_gbs_str = CompressionSetup.v100_bw_gbs_str
llm_175b_str = fmt(llm_175b_params_value, precision=0, commas=False) v100_tflops_fp32_str = CompressionSetup.v100_tflops_fp32_str
llm_175b_mem_str = fmt(llm_175b_mem_fp16_gb_value, precision=0, commas=False) resnet_params_m_str = CompressionSetup.resnet_params_m_str
smartphone_ram_str = f"{smartphone_ram_gb_value}" resnet_gflops_str = CompressionSetup.resnet_gflops_str
mcu_ram_str = f"{mcu_ram_kb_value}" mobilenetv2_mflops_str = CompressionSetup.mobilenetv2_mflops_str
gpt3_training_flops_str = f"$3.14 \\times 10^{{{gpt3_training_flops_exp_value}}}$" llm_7b_str = CompressionSetup.llm_7b_str
llm_7b_mem_str = CompressionSetup.llm_7b_mem_str
llm_175b_str = CompressionSetup.llm_175b_str
llm_175b_mem_str = CompressionSetup.llm_175b_mem_str
smartphone_ram_str = CompressionSetup.smartphone_ram_str
mcu_ram_str = CompressionSetup.mcu_ram_str
gpt3_training_flops_str = CompressionSetup.gpt3_training_flops_str
# Note: v100_bw_gbs_value used by downstream fusion-calc cell
v100_bw_gbs_value = CompressionSetup.v100_bw_gbs
v100_tflops_fp32_value = CompressionSetup.v100_tflops_fp32
``` ```
## Optimization Framework {#sec-model-compression-optimization-framework-9e21} ## Optimization Framework {#sec-model-compression-optimization-framework-9e21}
A `{python} llm_7b_str`-billion parameter language model requires `{python} llm_7b_mem_str` GB just to store its weights in FP16. Your deployment target is a smartphone with `{python} smartphone_ram_str` GB of RAM shared across the operating system, applications, and your model. *The math does not work.* No amount of clever engineering changes this arithmetic: `{python} llm_7b_mem_str` GB cannot fit in `{python} smartphone_ram_str` GB. Yet users expect the model to run: responsively, offline, without draining their battery in an hour. The gap between what training produces and what deployment permits (the Latency Budget, the maximum allowable end-to-end inference time, defined formally in @sec-model-serving) is not a minor inconvenience but a defining challenge of model compression. A `{python} llm_7b_str`-billion parameter language model requires `{python} llm_7b_mem_str` GB just to store its weights in FP16. Your deployment target is a smartphone with `{python} smartphone_ram_str` GB of RAM shared across the operating system, applications, and your model. *The math does not work.* No amount of clever engineering changes this arithmetic: `{python} llm_7b_mem_str` GB cannot fit in `{python} smartphone_ram_str` GB. Yet users expect the model to run: responsively, offline, without draining their battery in an hour. The gap between what training produces and what deployment permits (the Latency Budget, the maximum allowable end-to-end inference time, defined formally in @sec-model-serving) is not a minor inconvenience but a defining challenge of model compression.
@@ -420,7 +454,6 @@ We call this phenomenon *the quantization speedup*.
The relative importance of each dimension varies by deployment target. Cloud systems may tolerate larger models but demand throughput; mobile devices prioritize memory and energy; embedded systems face hard constraints on all resources simultaneously. Understanding these deployment contexts shapes which optimization dimensions to prioritize. The relative importance of each dimension varies by deployment target. Cloud systems may tolerate larger models but demand throughput; mobile devices prioritize memory and energy; embedded systems face hard constraints on all resources simultaneously. Understanding these deployment contexts shapes which optimization dimensions to prioritize.
## Deployment Context {#sec-model-compression-deployment-context-0d88} ## Deployment Context {#sec-model-compression-deployment-context-0d88}
The optimization framework above identifies three dimensions of compression, but which dimensions matter most depends entirely on where the model will run. A datacenter GPU with 80 GB of HBM faces different binding constraints than a smartphone with shared RAM or a microcontroller with 256 KB of SRAM. @tbl-deployment-scenarios summarizes the key constraints across deployment environments. The optimization framework above identifies three dimensions of compression, but which dimensions matter most depends entirely on where the model will run. A datacenter GPU with 80 GB of HBM faces different binding constraints than a smartphone with shared RAM or a microcontroller with 256 KB of SRAM. @tbl-deployment-scenarios summarizes the key constraints across deployment environments.
@@ -482,55 +515,80 @@ from mlsys.constants import (GB, GiB, MiB, KiB, MB, KB, byte,
CLOUD_MEM_GIB, MOBILE_MEM_GIB, TINY_MEM_KIB, CLOUD_MEM_GIB, MOBILE_MEM_GIB, TINY_MEM_KIB,
DLRM_MODEL_SIZE_FP32) DLRM_MODEL_SIZE_FP32)
# --- Inputs (device capacities and model sizes) --- def _get_ratio(model_mem, device_mem):
cloud_mem_value = CLOUD_MEM_GIB """Return 'ok' if model fits, else 'no (Nx)' with how many times it overflows."""
mobile_mem_value = MOBILE_MEM_GIB ratio = model_mem.m_as(byte) / device_mem.m_as(byte)
tiny_mem_value = TINY_MEM_KIB
dlrm_mem_value = DLRM_MODEL_SIZE_FP32
gpt2_mem_value = 6 * GiB
resnet_mem_value = 100 * MiB
mobilenet_mem_value = 14 * MiB
mobilenet_int8_mem_value = 3.5 * MiB
dscnn_mem_value = 500 * KiB
# --- Process (compute fit ratios) ---
def get_ratio(model_mem, device_mem):
ratio = model_mem.to(byte).magnitude / device_mem.to(byte).magnitude
if ratio < 1: if ratio < 1:
return "ok" return "ok"
return f"no ({ratio:.0f}x)" return f"no ({ratio:.0f}x)"
dlrm_mobile_value = get_ratio(dlrm_mem_value, mobile_mem_value) class ModelDeviceComparison:
dlrm_tiny_value = get_ratio(dlrm_mem_value, tiny_mem_value) """Contrast model requirements with device memory: 6-order-of-magnitude deployment gap."""
gpt2_mobile_value = get_ratio(gpt2_mem_value, mobile_mem_value) # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
gpt2_tiny_value = get_ratio(gpt2_mem_value, tiny_mem_value) # Device capacities
cloud_mem = CLOUD_MEM_GIB
mobile_mem = MOBILE_MEM_GIB
tiny_mem = TINY_MEM_KIB
resnet_tiny_value = get_ratio(resnet_mem_value, tiny_mem_value) # Model sizes
mobilenet_tiny_value = get_ratio(mobilenet_mem_value, tiny_mem_value) dlrm_mem = DLRM_MODEL_SIZE_FP32
mobilenet_int8_tiny_value = get_ratio(mobilenet_int8_mem_value, tiny_mem_value) gpt2_mem = 6 * GiB
resnet_mem = 100 * MiB
mobilenet_mem = 14 * MiB
mobilenet_int8_mem = 3.5 * MiB
dscnn_mem = 500 * KiB
# --- Outputs (formatted strings for prose) --- # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
dlrm_str = f"{dlrm_mem_value.to(GB).magnitude:.0f} GB" dlrm_mobile = _get_ratio(dlrm_mem, mobile_mem)
gpt2_str = f"{gpt2_mem_value.to(GiB).magnitude:.0f} GB" dlrm_tiny = _get_ratio(dlrm_mem, tiny_mem)
resnet_str = f"{resnet_mem_value.to(MiB).magnitude:.0f} MB" gpt2_mobile = _get_ratio(gpt2_mem, mobile_mem)
mobilenet_str = f"{mobilenet_mem_value.to(MiB).magnitude:.0f} MB" gpt2_tiny = _get_ratio(gpt2_mem, tiny_mem)
mobilenet_int8_str = f"{mobilenet_int8_mem_value.to(MiB).magnitude:.1f} MB" resnet_tiny = _get_ratio(resnet_mem, tiny_mem)
dscnn_str = f"{dscnn_mem_value.to(KiB).magnitude:.0f} KB" mobilenet_tiny = _get_ratio(mobilenet_mem, tiny_mem)
mobilenet_int8_tiny = _get_ratio(mobilenet_int8_mem, tiny_mem)
cloud_cap_str = f"~{cloud_mem_value.to(GiB).magnitude:.0f} GB" # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
mobile_cap_str = f"~{mobile_mem_value.to(GiB).magnitude:.0f} GB" # DS-CNN always fits TinyML — sanity check
tiny_cap_str = f"~{tiny_mem_value.to(KiB).magnitude:.0f} KB" assert _get_ratio(dscnn_mem, tiny_mem) == "ok", "DS-CNN must fit in TinyML device."
dlrm_mobile_str = dlrm_mobile_value # ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
dlrm_tiny_str = dlrm_tiny_value dlrm_str = f"{dlrm_mem.m_as(GB):.0f} GB"
gpt2_mobile_str = gpt2_mobile_value gpt2_str = f"{gpt2_mem.m_as(GiB):.0f} GB"
gpt2_tiny_str = gpt2_tiny_value resnet_str = f"{resnet_mem.m_as(MiB):.0f} MB"
resnet_tiny_str = resnet_tiny_value mobilenet_str = f"{mobilenet_mem.m_as(MiB):.0f} MB"
mobilenet_tiny_str = mobilenet_tiny_value mobilenet_int8_str = f"{mobilenet_int8_mem.m_as(MiB):.1f} MB"
mobilenet_int8_tiny_str = mobilenet_int8_tiny_value dscnn_str = f"{dscnn_mem.m_as(KiB):.0f} KB"
dscnn_tiny_str = "ok" cloud_cap_str = f"~{cloud_mem.m_as(GiB):.0f} GB"
mobile_cap_str = f"~{mobile_mem.m_as(GiB):.0f} GB"
tiny_cap_str = f"~{tiny_mem.m_as(KiB):.0f} KB"
dlrm_mobile_str = dlrm_mobile
dlrm_tiny_str = dlrm_tiny
gpt2_mobile_str = gpt2_mobile
gpt2_tiny_str = gpt2_tiny
resnet_tiny_str = resnet_tiny
mobilenet_tiny_str = mobilenet_tiny
mobilenet_int8_tiny_str = mobilenet_int8_tiny
dscnn_tiny_str = "ok"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
dlrm_str = ModelDeviceComparison.dlrm_str
gpt2_str = ModelDeviceComparison.gpt2_str
resnet_str = ModelDeviceComparison.resnet_str
mobilenet_str = ModelDeviceComparison.mobilenet_str
mobilenet_int8_str = ModelDeviceComparison.mobilenet_int8_str
dscnn_str = ModelDeviceComparison.dscnn_str
cloud_cap_str = ModelDeviceComparison.cloud_cap_str
mobile_cap_str = ModelDeviceComparison.mobile_cap_str
tiny_cap_str = ModelDeviceComparison.tiny_cap_str
dlrm_mobile_str = ModelDeviceComparison.dlrm_mobile_str
dlrm_tiny_str = ModelDeviceComparison.dlrm_tiny_str
gpt2_mobile_str = ModelDeviceComparison.gpt2_mobile_str
gpt2_tiny_str = ModelDeviceComparison.gpt2_tiny_str
resnet_tiny_str = ModelDeviceComparison.resnet_tiny_str
mobilenet_tiny_str = ModelDeviceComparison.mobilenet_tiny_str
mobilenet_int8_tiny_str = ModelDeviceComparison.mobilenet_int8_tiny_str
dscnn_tiny_str = ModelDeviceComparison.dscnn_tiny_str
``` ```
| **Model** | **Memory** **(Runtime)** | **Storage** **(Weights)** | **Cloud** **(`{python} cloud_cap_str`)** | **Mobile** **(`{python} mobile_cap_str`)** | **TinyML** **(`{python} tiny_cap_str`)** | | **Model** | **Memory** **(Runtime)** | **Storage** **(Weights)** | **Cloud** **(`{python} cloud_cap_str`)** | **Mobile** **(`{python} mobile_cap_str`)** | **TinyML** **(`{python} tiny_cap_str`)** |
@@ -600,7 +658,6 @@ Optimization is about trading one resource for another.
Each deployment context above imposes a binding constraint: memory capacity on mobile devices, latency on real-time systems, energy on battery-powered sensors. The optimization techniques that follow address these constraints at three successive levels of the stack. We begin with structural methods that modify *what* computations occur, reducing the model's parameter count and operation count to fit tighter memory and compute budgets. We then turn to precision techniques that reduce how many bits represent each value, directly shrinking memory footprint and accelerating arithmetic. Finally, we address architectural approaches that improve how efficiently the remaining operations execute on physical hardware, closing the gap between theoretical savings and measured performance. Each deployment context above imposes a binding constraint: memory capacity on mobile devices, latency on real-time systems, energy on battery-powered sensors. The optimization techniques that follow address these constraints at three successive levels of the stack. We begin with structural methods that modify *what* computations occur, reducing the model's parameter count and operation count to fit tighter memory and compute budgets. We then turn to precision techniques that reduce how many bits represent each value, directly shrinking memory footprint and accelerating arithmetic. Finally, we address architectural approaches that improve how efficiently the remaining operations execute on physical hardware, closing the gap between theoretical savings and measured performance.
## Structural Optimization {#sec-model-compression-structural-optimization-ee93} ## Structural Optimization {#sec-model-compression-structural-optimization-ee93}
\index{Model Compression!structural optimization} \index{Model Compression!structural optimization}
@@ -2764,7 +2821,6 @@ Test your understanding of the structural optimization techniques covered so far
- [ ] Can you identify when to choose Neural Architecture Search over manual architecture design? Consider the trade-offs in computational cost, design space coverage, and hardware-specific optimization. - [ ] Can you identify when to choose Neural Architecture Search over manual architecture design? Consider the trade-offs in computational cost, design space coverage, and hardware-specific optimization.
::: :::
## Quantization and Precision {#sec-model-compression-quantization-precision-cd46} ## Quantization and Precision {#sec-model-compression-quantization-precision-cd46}
\index{Model Compression!precision optimization} \index{Model Compression!precision optimization}
@@ -3690,44 +3746,57 @@ Compare the two mapping diagrams side by side in @fig-calibration-ranges. Symmet
# │ zero_point_str, x_val_str, x_q_str, x_recon_str # │ zero_point_str, x_val_str, x_q_str, x_recon_str
# └───────────────────────────────────────────────────────────────────────────── # └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
from mlsys.constants import KIB_TO_BYTES
# --- Inputs (activation range example) --- class QuantizationMathCalc:
alpha_value = -1.0 """Derive affine quantization parameters: scale and zero-point for [-1.0, 3.0] → UINT8."""
beta_value = 3.0
bits_value = 8
x_val_value = 0.0 # value to quantize
# --- Process (calculate affine parameters) --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
# 1. Calculate Scale (s) alpha = -1.0 # activation range min
# s = (beta - alpha) / (2^b - 1) beta = 3.0 # activation range max
int_steps_value = 2**bits_value - 1 bits = 8 # target bit-width
scale_value = (beta_value - alpha_value) / int_steps_value x_val = 0.0 # value to quantize
# 2. Calculate Zero-Point (z) # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
# z = round(-alpha / s) # 1. Scale: s = (beta - alpha) / (2^b - 1)
# Note: z maps the real value 0.0 to an integer int_steps = 2**bits - 1
zero_point_value = round(-alpha_value / scale_value) scale = (beta - alpha) / int_steps
# 3. Quantize a value # 2. Zero-point: z = round(-alpha / s)
# x_q = clamp(round(x / s) + z, 0, 2^b - 1) zero_point = round(-alpha / scale)
x_q_raw = round(x_val_value / scale_value) + zero_point_value
x_q_value = max(0, min(int_steps_value, x_q_raw))
# 4. Dequantize (reconstruct) # 3. Quantize: x_q = clamp(round(x/s) + z, 0, 2^b - 1)
# x_recon = (x_q - z) * s x_q_raw = round(x_val / scale) + zero_point
x_recon_value = (x_q_value - zero_point_value) * scale_value x_q = max(0, min(int_steps, x_q_raw))
# --- Outputs (formatted strings for prose) --- # 4. Dequantize: x_recon = (x_q - z) * s
alpha_str = fmt(alpha_value, precision=1, commas=False) # "-1.0" x_recon = (x_q - zero_point) * scale
beta_str = fmt(beta_value, precision=1, commas=False) # "3.0"
range_str = fmt(beta_value - alpha_value, precision=1, commas=False) # "4.0" # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
steps_str = f"{int_steps_value}" # "255" check(scale > 0, "Scale must be positive.")
scale_str = fmt(scale_value, precision=4, commas=False) # "0.0157" check(0 <= zero_point <= int_steps, "Zero-point must be in valid integer range.")
zero_point_str = f"{int(zero_point_value)}" # "64" check(abs(x_recon - x_val) < scale, "Reconstruction error must be less than one step size.")
x_val_str = fmt(x_val_value, precision=1, commas=False) # "0.0"
x_q_str = f"{int(x_q_value)}" # "64" # ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
x_recon_str = fmt(x_recon_value, precision=2, commas=False) # "0.00" alpha_str = fmt(alpha, precision=1, commas=False) # "-1.0"
beta_str = fmt(beta, precision=1, commas=False) # "3.0"
range_str = fmt(beta - alpha, precision=1, commas=False) # "4.0"
steps_str = f"{int_steps}" # "255"
scale_str = fmt(scale, precision=4, commas=False) # "0.0157"
zero_point_str = f"{int(zero_point)}" # "64"
x_val_str = fmt(x_val, precision=1, commas=False) # "0.0"
x_q_str = f"{int(x_q)}" # "64"
x_recon_str = fmt(x_recon, precision=2, commas=False) # "0.00"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
alpha_str = QuantizationMathCalc.alpha_str
beta_str = QuantizationMathCalc.beta_str
range_str = QuantizationMathCalc.range_str
steps_str = QuantizationMathCalc.steps_str
scale_str = QuantizationMathCalc.scale_str
zero_point_str = QuantizationMathCalc.zero_point_str
x_val_str = QuantizationMathCalc.x_val_str
x_q_str = QuantizationMathCalc.x_q_str
x_recon_str = QuantizationMathCalc.x_recon_str
``` ```
::: {.callout-notebook title="Calculating Scale and Zero-Point"} ::: {.callout-notebook title="Calculating Scale and Zero-Point"}
@@ -4326,7 +4395,6 @@ Yet practitioners often discover a frustrating gap between theory and practice:
The gap arises from several sources. Sparse matrices stored in dense format waste memory bandwidth loading zeros—the hardware cannot skip what it does not know is zero. Operations that could run in parallel execute sequentially due to data dependencies the compiler cannot resolve. Simple inputs receive the same computational budget as complex ones because the model has no mechanism to exit early. Closing the gap between "optimized on paper" and "optimized in practice" is the domain of our third optimization dimension: **architectural efficiency**. This dimension ensures that structural and precision optimizations translate into real-world speedups by aligning computation patterns with hardware capabilities. The gap arises from several sources. Sparse matrices stored in dense format waste memory bandwidth loading zeros—the hardware cannot skip what it does not know is zero. Operations that could run in parallel execute sequentially due to data dependencies the compiler cannot resolve. Simple inputs receive the same computational budget as complex ones because the model has no mechanism to exit early. Closing the gap between "optimized on paper" and "optimized in practice" is the domain of our third optimization dimension: **architectural efficiency**. This dimension ensures that structural and precision optimizations translate into real-world speedups by aligning computation patterns with hardware capabilities.
## Architectural Efficiency {#sec-model-compression-architectural-efficiency-8dd3} ## Architectural Efficiency {#sec-model-compression-architectural-efficiency-8dd3}
Architectural efficiency optimization ensures that computations execute efficiently on target hardware by aligning model operations with processor capabilities and memory hierarchies. Where representation optimization determines *what* computations to perform and precision optimization determines *how precisely* to compute, architectural efficiency addresses *how* operations are scheduled, memory is accessed, and workloads adapt to input characteristics. This third dimension closes the gap between theoretical compression ratios and real-world speedups. Architectural efficiency optimization ensures that computations execute efficiently on target hardware by aligning model operations with processor capabilities and memory hierarchies. Where representation optimization determines *what* computations to perform and precision optimization determines *how precisely* to compute, architectural efficiency addresses *how* operations are scheduled, memory is accessed, and workloads adapt to input characteristics. This third dimension closes the gap between theoretical compression ratios and real-world speedups.
@@ -4452,77 +4520,102 @@ Beyond reducing what data must be stored, substantial efficiency gains emerge fr
# │ kernels_fused_str, saved_latency_ms_str # │ kernels_fused_str, saved_latency_ms_str
# └───────────────────────────────────────────────────────────────────────────── # └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
from mlsys.constants import KIB_TO_BYTES from mlsys.constants import KIB_TO_BYTES, MILLION
# --- Inputs (Conv-BN-ReLU) --- class FusionCalc:
conv_channels_value = 256 """Quantify latency and bandwidth benefits of Conv-BN-ReLU operator fusion on ResNet-50."""
conv_spatial_value = 28
bytes_per_element_value = 4
# GEMM # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
gemm_hidden_value = 768 # Conv-BN-ReLU layer geometry
gemm_seq_value = 512 conv_channels = 256
conv_spatial = 28
bytes_per_element = 4 # FP32
# Memory Bandwidth Analysis (ResNet-50 layer) # GEMM geometry
# Feature map: 256 channels × 28 × 28 spatial × 4 bytes/element (FP32) gemm_hidden = 768
feat_map_mb_value = conv_channels_value * conv_spatial_value * conv_spatial_value * bytes_per_element_value / MILLION # SI MB gemm_seq = 512
weights_mb_value = 2.4
bn_params_mb_value = 0.002
# Kernel Launch # ResNet-50 layer memory baseline
kernels_unfused_value = 159 weights_mb = 2.4
kernels_fused_value = 53 bn_params_mb = 0.002
latency_per_kernel_us_value = 10
# --- Process --- # Kernel launch overhead
# Conv-BN-ReLU intermediate kernels_unfused = 159
conv_bn_relu_intermediate_bytes = 2 * conv_channels_value * conv_spatial_value * conv_spatial_value * bytes_per_element_value kernels_fused = 53
conv_bn_relu_intermediate_mb_value = conv_bn_relu_intermediate_bytes / (1024**2) latency_per_kernel_us = 10
# GEMM intermediate # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
gemm_intermediate_bytes = gemm_hidden_value * gemm_seq_value * bytes_per_element_value # Feature map size (SI MB)
gemm_intermediate_mb_value = gemm_intermediate_bytes / (1024**2) feat_map_mb = conv_channels * conv_spatial * conv_spatial * bytes_per_element / MILLION
# Bandwidth Analysis # Conv-BN-ReLU intermediate (2 feature maps written: conv→BN boundary)
unfused_conv_mb_value = feat_map_mb_value * 2 + weights_mb_value conv_bn_relu_intermediate_mb = (
unfused_bn_mb_value = feat_map_mb_value * 2 + bn_params_mb_value 2 * conv_channels * conv_spatial * conv_spatial * bytes_per_element / (1024**2)
unfused_relu_mb_value = feat_map_mb_value * 2 )
total_unfused_mb_value = unfused_conv_mb_value + unfused_bn_mb_value + unfused_relu_mb_value
total_fused_mb_value = feat_map_mb_value * 2 + weights_mb_value # GEMM intermediate
bandwidth_reduction_pct_value = (1 - total_fused_mb_value / total_unfused_mb_value) * 100 gemm_intermediate_mb = gemm_hidden * gemm_seq * bytes_per_element / (1024**2)
# Kernel Launch # Unfused bandwidth: Conv (feat*2 + weights) + BN (feat*2 + bn) + ReLU (feat*2)
saved_latency_us_value = (kernels_unfused_value - kernels_fused_value) * latency_per_kernel_us_value unfused_conv_mb = feat_map_mb * 2 + weights_mb
saved_latency_ms_value = saved_latency_us_value / 1000 unfused_bn_mb = feat_map_mb * 2 + bn_params_mb
unfused_relu_mb = feat_map_mb * 2
total_unfused_mb = unfused_conv_mb + unfused_bn_mb + unfused_relu_mb
# V100 timing analysis (memory-bound) # Fused bandwidth: read input + weights once, write output once
v100_bw_gbs_local_value = v100_bw_gbs_value # from earlier cell total_fused_mb = feat_map_mb * 2 + weights_mb
unfused_time_us_value = total_unfused_mb_value / v100_bw_gbs_local_value * 1000 # MB / (GB/s) * 1000 = us bandwidth_reduction_pct = (1 - total_fused_mb / total_unfused_mb) * 100
fused_time_us_value = total_fused_mb_value / v100_bw_gbs_local_value * 1000
fusion_speedup_value = unfused_time_us_value / fused_time_us_value
# --- Outputs (formatted strings for prose) --- # Kernel launch savings
conv_bn_relu_intermediate_mb_str = fmt(conv_bn_relu_intermediate_mb_value, precision=1, commas=False) saved_latency_us = (kernels_unfused - kernels_fused) * latency_per_kernel_us
gemm_intermediate_mb_str = fmt(gemm_intermediate_mb_value, precision=1, commas=False) saved_latency_ms = saved_latency_us / 1000
feat_map_kb_str = fmt(feat_map_mb_value * 1000, precision=0, commas=False) # V100 timing (memory-bound): MB / (GB/s) * 1000 = µs
weights_mb_str = fmt(weights_mb_value, precision=1, commas=False) unfused_time_us = total_unfused_mb / v100_bw_gbs_value * 1000
bn_params_kb_str = fmt(bn_params_mb_value * KIB_TO_BYTES, precision=0, commas=False) fused_time_us = total_fused_mb / v100_bw_gbs_value * 1000
fusion_speedup = unfused_time_us / fused_time_us
unfused_conv_mb_str = fmt(unfused_conv_mb_value, precision=1, commas=False) # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
unfused_bn_mb_str = fmt(unfused_bn_mb_value, precision=1, commas=False) check(bandwidth_reduction_pct > 40, "Fusion should reduce bandwidth by more than 40%.")
unfused_relu_mb_str = fmt(unfused_relu_mb_value, precision=1, commas=False) check(fusion_speedup > 1, "Fused execution must be faster than unfused.")
total_unfused_mb_str = fmt(total_unfused_mb_value, precision=1, commas=False)
total_fused_mb_str = fmt(total_fused_mb_value, precision=1, commas=False)
bandwidth_reduction_pct_str = fmt(bandwidth_reduction_pct_value, precision=0, commas=False)
kernels_unfused_str = fmt(kernels_unfused_value, precision=0, commas=False) # ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
kernels_fused_str = fmt(kernels_fused_value, precision=0, commas=False) conv_bn_relu_intermediate_mb_str = fmt(conv_bn_relu_intermediate_mb, precision=1, commas=False)
saved_latency_ms_str = fmt(saved_latency_ms_value, precision=0, commas=False) gemm_intermediate_mb_str = fmt(gemm_intermediate_mb, precision=1, commas=False)
unfused_time_us_str = fmt(unfused_time_us_value, precision=0, commas=False) feat_map_kb_str = fmt(feat_map_mb * 1000, precision=0, commas=False)
fused_time_us_str = fmt(fused_time_us_value, precision=1, commas=False) weights_mb_str = fmt(weights_mb, precision=1, commas=False)
fusion_speedup_str = fmt(fusion_speedup_value, precision=2, commas=False) bn_params_kb_str = fmt(bn_params_mb * KIB_TO_BYTES, precision=0, commas=False)
unfused_conv_mb_str = fmt(unfused_conv_mb, precision=1, commas=False)
unfused_bn_mb_str = fmt(unfused_bn_mb, precision=1, commas=False)
unfused_relu_mb_str = fmt(unfused_relu_mb, precision=1, commas=False)
total_unfused_mb_str = fmt(total_unfused_mb, precision=1, commas=False)
total_fused_mb_str = fmt(total_fused_mb, precision=1, commas=False)
bandwidth_reduction_pct_str = fmt(bandwidth_reduction_pct, precision=0, commas=False)
kernels_unfused_str = fmt(kernels_unfused, precision=0, commas=False)
kernels_fused_str = fmt(kernels_fused, precision=0, commas=False)
saved_latency_ms_str = fmt(saved_latency_ms, precision=0, commas=False)
unfused_time_us_str = fmt(unfused_time_us, precision=0, commas=False)
fused_time_us_str = fmt(fused_time_us, precision=1, commas=False)
fusion_speedup_str = fmt(fusion_speedup, precision=2, commas=False)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
conv_bn_relu_intermediate_mb_str = FusionCalc.conv_bn_relu_intermediate_mb_str
gemm_intermediate_mb_str = FusionCalc.gemm_intermediate_mb_str
feat_map_kb_str = FusionCalc.feat_map_kb_str
weights_mb_str = FusionCalc.weights_mb_str
bn_params_kb_str = FusionCalc.bn_params_kb_str
unfused_conv_mb_str = FusionCalc.unfused_conv_mb_str
unfused_bn_mb_str = FusionCalc.unfused_bn_mb_str
unfused_relu_mb_str = FusionCalc.unfused_relu_mb_str
total_unfused_mb_str = FusionCalc.total_unfused_mb_str
total_fused_mb_str = FusionCalc.total_fused_mb_str
bandwidth_reduction_pct_str = FusionCalc.bandwidth_reduction_pct_str
kernels_unfused_str = FusionCalc.kernels_unfused_str
kernels_fused_str = FusionCalc.kernels_fused_str
saved_latency_ms_str = FusionCalc.saved_latency_ms_str
unfused_time_us_str = FusionCalc.unfused_time_us_str
fused_time_us_str = FusionCalc.fused_time_us_str
fusion_speedup_str = FusionCalc.fusion_speedup_str
``` ```
#### Operator Fusion {#sec-model-compression-operator-fusion-ac1d} #### Operator Fusion {#sec-model-compression-operator-fusion-ac1d}
@@ -4594,16 +4687,28 @@ def conv_bn_relu_fused(input, weight, gamma, beta, mean, var):
# └───────────────────────────────────────────────────────────────────────────── # └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check, md_math from mlsys.formatting import fmt, check, md_math
# --- Inputs (transfer counts) --- class ConvFusionCalc:
unfused_transfers_value = 6 # read/write for each of conv, BN, ReLU """Demonstrate 3x memory traffic reduction from Conv-BN-ReLU fusion (6 transfers → 2)."""
fused_transfers_value = 2 # read input, write output
# --- Process --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
transfer_reduction_value = unfused_transfers_value / fused_transfers_value unfused_transfers = 6 # read/write for Conv, BN, ReLU
fused_transfers = 2 # read input, write output
# --- Outputs (formatted strings for prose) --- # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
transfer_reduction_str = fmt(transfer_reduction_value, precision=0, commas=False) transfer_reduction = unfused_transfers / fused_transfers
conv_bn_relu_mem_md = md_math(f"2 \\times 256 \\times 28 \\times 28 \\times 4 \\text{{ bytes}} \\approx \\text{{{conv_bn_relu_intermediate_mb_str} MB}}")
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(transfer_reduction == 3, "Conv-BN-ReLU fusion must yield exactly 3x transfer reduction.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
transfer_reduction_str = fmt(transfer_reduction, precision=0, commas=False)
conv_bn_relu_mem_md = md_math(
f"2 \\times 256 \\times 28 \\times 28 \\times 4 \\text{{ bytes}} \\approx \\text{{{conv_bn_relu_intermediate_mb_str} MB}}"
)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
transfer_reduction_str = ConvFusionCalc.transfer_reduction_str
conv_bn_relu_mem_md = ConvFusionCalc.conv_bn_relu_mem_md
``` ```
The arithmetic operations remain identical, but memory traffic drops from 6 transfers to 2 transfers (`{python} transfer_reduction_str` $\times$ reduction). For a ResNet-50 layer with 256 channels and spatial size $28 \times 28$, this eliminates `{python} conv_bn_relu_mem_md` of intermediate memory traffic per layer. The arithmetic operations remain identical, but memory traffic drops from 6 transfers to 2 transfers (`{python} transfer_reduction_str` $\times$ reduction). For a ResNet-50 layer with 256 channels and spatial size $28 \times 28$, this eliminates `{python} conv_bn_relu_mem_md` of intermediate memory traffic per layer.
@@ -6276,7 +6381,6 @@ Unlike software functions that compose predictably, optimization techniques inte
With the three optimization dimensions now fully explored, practitioners need systematic guidance for translating this knowledge into deployment decisions. With the three optimization dimensions now fully explored, practitioners need systematic guidance for translating this knowledge into deployment decisions.
## Technique Selection {#sec-model-compression-technique-selection-ba16} ## Technique Selection {#sec-model-compression-technique-selection-ba16}
An engineer deploying a transformer model faces a concrete decision: the model exceeds the target device's memory by 3 $\times$, inference latency is 4 $\times$ above the SLO, and the power budget allows no more than 2 W sustained. Should she quantize first, prune first, distill to a smaller architecture, or combine techniques? The answer depends on which constraint is binding, what accuracy loss is tolerable, and how much engineering time is available. This section provides structured guidance for navigating that decision. An engineer deploying a transformer model faces a concrete decision: the model exceeds the target device's memory by 3 $\times$, inference latency is 4 $\times$ above the SLO, and the power budget allows no more than 2 W sustained. Should she quantize first, prune first, distill to a smaller architecture, or combine techniques? The answer depends on which constraint is binding, what accuracy loss is tolerable, and how much engineering time is available. This section provides structured guidance for navigating that decision.
@@ -6314,7 +6418,6 @@ These choices also depend on the available engineering budget. When fine-tuning
This decision framework provides starting points for individual technique selection. Validating that a chosen technique actually achieves its intended goal requires systematic profiling and measurement, which @sec-model-compression-efficiency-measurement-2424 formalizes in detail. However, production deployments rarely rely on a single technique. Combining pruning with quantization, or distillation with hardware-aware design, introduces interaction effects that can either amplify benefits or create unexpected accuracy degradation. The following section addresses how to sequence and combine techniques effectively. This decision framework provides starting points for individual technique selection. Validating that a chosen technique actually achieves its intended goal requires systematic profiling and measurement, which @sec-model-compression-efficiency-measurement-2424 formalizes in detail. However, production deployments rarely rely on a single technique. Combining pruning with quantization, or distillation with hardware-aware design, introduces interaction effects that can either amplify benefits or create unexpected accuracy degradation. The following section addresses how to sequence and combine techniques effectively.
## Optimization Strategies {#sec-model-compression-optimization-strategies-f2f6} ## Optimization Strategies {#sec-model-compression-optimization-strategies-f2f6}
The decision framework above guides individual technique selection, but the largest optimization gains emerge from combining multiple techniques. Because pruning, quantization, and architectural efficiency operate at different levels of the stack, they provide multiplicative benefits when sequenced appropriately. The decision framework above guides individual technique selection, but the largest optimization gains emerge from combining multiple techniques. Because pruning, quantization, and architectural efficiency operate at different levels of the stack, they provide multiplicative benefits when sequenced appropriately.
@@ -6528,7 +6631,6 @@ This example illustrates why sequencing matters: pruning first concentrates impo
With dozens of techniques across three optimization dimensions, rigorous measurement is essential for validating that optimizations achieve their intended goals. A practitioner who prunes, quantizes, and fuses without profiling the actual impact on target hardware is optimizing blindly. With dozens of techniques across three optimization dimensions, rigorous measurement is essential for validating that optimizations achieve their intended goals. A practitioner who prunes, quantizes, and fuses without profiling the actual impact on target hardware is optimizing blindly.
## Efficiency Measurement {#sec-model-compression-efficiency-measurement-2424} ## Efficiency Measurement {#sec-model-compression-efficiency-measurement-2424}
A model quantized to INT8 should be 4 $\times$ smaller and roughly 3 $\times$ faster, but does it actually achieve those gains on the target hardware? Theoretical compression ratios and measured deployment improvements often diverge, sometimes dramatically, because real speedups depend on memory hierarchy effects, kernel implementations, and hardware utilization patterns that theory alone cannot predict. Translating theoretical compression ratios into measurable deployment improvements therefore requires systematic profiling and evaluation. A model quantized to INT8 should be 4 $\times$ smaller and roughly 3 $\times$ faster, but does it actually achieve those gains on the target hardware? Theoretical compression ratios and measured deployment improvements often diverge, sometimes dramatically, because real speedups depend on memory hierarchy effects, kernel implementations, and hardware utilization patterns that theory alone cannot predict. Translating theoretical compression ratios into measurable deployment improvements therefore requires systematic profiling and evaluation.
@@ -6566,7 +6668,6 @@ With these comprehensive baselines in place, the measurement framework must trac
Rigorous measurement tells practitioners *whether* their optimizations succeeded, but the measurements themselves require tooling to perform. Profiling, quantization, pruning, and deployment all depend on software frameworks that automate otherwise prohibitively complex workflows. We turn now to the implementation tools that make these techniques practical. Rigorous measurement tells practitioners *whether* their optimizations succeeded, but the measurements themselves require tooling to perform. Profiling, quantization, pruning, and deployment all depend on software frameworks that automate otherwise prohibitively complex workflows. We turn now to the implementation tools that make these techniques practical.
## Implementation Tools {#sec-model-compression-implementation-tools-4990} ## Implementation Tools {#sec-model-compression-implementation-tools-4990}
Understanding optimization techniques is necessary but not sufficient; practical implementation relies on robust software support. Without framework tooling, quantization would require manual modification of model definitions and careful insertion of quantization operations throughout the network, while pruning would demand direct manipulation of weight tensors. Both become prohibitively complex as models scale. Understanding optimization techniques is necessary but not sufficient; practical implementation relies on robust software support. Without framework tooling, quantization would require manual modification of model definitions and careful insertion of quantization operations throughout the network, while pruning would demand direct manipulation of weight tensors. Both become prohibitively complex as models scale.
@@ -6655,7 +6756,6 @@ Sparsity heat maps show sparsity distribution across layers (@fig-sparse-heat-ma
With the implementation tools and visualization capabilities established, the natural question is: how do these techniques compare when a practitioner must choose among them? Each optimization approach carries distinct trade-offs in accuracy, training cost, and hardware requirements, and a structured comparison clarifies which to reach for first. With the implementation tools and visualization capabilities established, the natural question is: how do these techniques compare when a practitioner must choose among them? Each optimization approach carries distinct trade-offs in accuracy, training cost, and hardware requirements, and a structured comparison clarifies which to reach for first.
## Technique Comparison {#sec-model-compression-technique-comparison-3142} ## Technique Comparison {#sec-model-compression-technique-comparison-3142}
A comparative analysis across the three major approaches reveals how each addresses distinct aspects of the efficiency-accuracy trade-off. Pruning works best when sparse computation hardware is available and when reducing floating-point operations is critical. Quantization provides the most versatile approach with broad hardware support, making it ideal for diverse deployment scenarios. Knowledge distillation requires significant computational investment but produces consistently high-quality compressed models, making it the right choice when accuracy preservation is paramount. @tbl-optimization-comparison summarizes these trade-offs for systematic technique selection. A comparative analysis across the three major approaches reveals how each addresses distinct aspects of the efficiency-accuracy trade-off. Pruning works best when sparse computation hardware is available and when reducing floating-point operations is critical. Quantization provides the most versatile approach with broad hardware support, making it ideal for diverse deployment scenarios. Knowledge distillation requires significant computational investment but produces consistently high-quality compressed models, making it the right choice when accuracy preservation is paramount. @tbl-optimization-comparison summarizes these trade-offs for systematic technique selection.
@@ -6673,7 +6773,6 @@ These techniques combine synergistically, with quantization often applied after
With the complete optimization toolkit now surveyed—from individual techniques through combination strategies—the most instructive lessons often come not from what works but from what fails. The following fallacies and pitfalls capture the most common mistakes engineers make when applying these techniques, each grounded in the quantitative trade-offs we have established throughout the chapter. With the complete optimization toolkit now surveyed—from individual techniques through combination strategies—the most instructive lessons often come not from what works but from what fails. The following fallacies and pitfalls capture the most common mistakes engineers make when applying these techniques, each grounded in the quantitative trade-offs we have established throughout the chapter.
## Fallacies and Pitfalls {#sec-model-compression-fallacies-pitfalls-1b5e} ## Fallacies and Pitfalls {#sec-model-compression-fallacies-pitfalls-1b5e}
```{python} ```{python}
@@ -6773,7 +6872,6 @@ Teams apply post-training quantization (PTQ) to avoid retraining and achieve 96.
Teams achieve `{python} int8_size_reduction_str` $\times$ model size reduction through INT8 quantization and expect `{python} int8_size_reduction_str` $\times$ memory savings in deployment. In practice, runtime overhead erodes compression gains. Dequantization kernels add `{python} dequant_overhead_str`% latency overhead converting INT8 weights back to FP16. Pruned models with irregular sparsity achieve only 12% latency reduction despite `{python} param_removal_str`% parameter removal because hardware cannot skip zeroed weights efficiently. As @sec-model-compression-profiling-opportunity-analysis-477f demonstrates, a BERT model pruned to 50% sparsity and quantized to INT8 achieves `{python} actual_speedup_str`% end-to-end speedup rather than the expected `{python} expected_speedup_str` $\times$, because unstructured sparsity creates irregular memory access. Production workflows must profile *deployed* latency on target hardware, not extrapolate from compression ratios. Teams achieve `{python} int8_size_reduction_str` $\times$ model size reduction through INT8 quantization and expect `{python} int8_size_reduction_str` $\times$ memory savings in deployment. In practice, runtime overhead erodes compression gains. Dequantization kernels add `{python} dequant_overhead_str`% latency overhead converting INT8 weights back to FP16. Pruned models with irregular sparsity achieve only 12% latency reduction despite `{python} param_removal_str`% parameter removal because hardware cannot skip zeroed weights efficiently. As @sec-model-compression-profiling-opportunity-analysis-477f demonstrates, a BERT model pruned to 50% sparsity and quantized to INT8 achieves `{python} actual_speedup_str`% end-to-end speedup rather than the expected `{python} expected_speedup_str` $\times$, because unstructured sparsity creates irregular memory access. Production workflows must profile *deployed* latency on target hardware, not extrapolate from compression ratios.
## Summary {#sec-model-compression-summary-8229} ## Summary {#sec-model-compression-summary-8229}
Model compression is not a bag of tricks but an engineering discipline built on three complementary dimensions: *structural optimization* determines what the model computes, *precision optimization* determines how precisely it computes, and *architectural optimization* determines how efficiently those computations execute on physical hardware. The most important lesson of this chapter is that these dimensions compose multiplicatively. Pruning alone might achieve 2 $\times$ compression; quantization alone might achieve 4 $\times$; but pruning, distillation, and quantization applied together can achieve 16 $\times$ — as BERT's compression from 440 MB to 28 MB demonstrates. The second lesson is equally important: theoretical compression ratios lie. A 4 $\times$ reduction in parameters translates to 4 $\times$ latency improvement only when the optimization aligns with the hardware's execution model. Unstructured sparsity on hardware that lacks sparse kernels achieves almost nothing; INT8 quantization on hardware without INT8 units achieves even less. Profile on target hardware, not paper metrics. Model compression is not a bag of tricks but an engineering discipline built on three complementary dimensions: *structural optimization* determines what the model computes, *precision optimization* determines how precisely it computes, and *architectural optimization* determines how efficiently those computations execute on physical hardware. The most important lesson of this chapter is that these dimensions compose multiplicatively. Pruning alone might achieve 2 $\times$ compression; quantization alone might achieve 4 $\times$; but pruning, distillation, and quantization applied together can achieve 16 $\times$ — as BERT's compression from 440 MB to 28 MB demonstrates. The second lesson is equally important: theoretical compression ratios lie. 
A 4 $\times$ reduction in parameters translates to 4 $\times$ latency improvement only when the optimization aligns with the hardware's execution model. Unstructured sparsity on hardware that lacks sparse kernels achieves almost nothing; INT8 quantization on hardware without INT8 units achieves even less. Profile on target hardware, not paper metrics.

View File

@@ -21,6 +21,26 @@ When training throughput is low, check MFU, communication fraction, and goodput
```{python} ```{python}
#| label: appendix-c3-setup #| label: appendix-c3-setup
#| echo: false #| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ C³ TAXONOMY — MASTER COMPUTATION
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: PERSISTENT — All values used throughout the C³ Taxonomy appendix:
# │ @tbl-c3-dam-mapping, @tbl-c3-diagnostic-summary, @tbl-c3-traffic-light,
# │ @tbl-c3-bottleneck-actions, three case studies, scorecard, and exercises.
# │
# │ Goal: Provide all C³ diagnostic constants — case study parameters, effective
# │ FLOPS decomposition, and threshold strings — for the fleet-scale
# │ bottleneck classification reference appendix.
# │ Show: See individual section prose for formatted values. This cell provides
# │ the physics; string attributes are display-ready.
# │ How: calc_effective_flops() with MFU, scaling efficiency, and goodput ratio;
# │ all results as raw floats extracted via .m_as() or .magnitude where unitless.
# │
# │ Imports: mlsys.constants (H100_FLOPS_FP16_TENSOR, MFU_*, SCALING_EFF_*, OVERHEAD_*, …)
# │ mlsys.formulas (calc_effective_flops)
# │ mlsys.formatting (fmt, check, md_math)
# │ Exports: C3 = C3Taxonomy (accessed as C3.attribute in downstream cells)
# └─────────────────────────────────────────────────────────────────────────────
import math import math
from mlsys.constants import ( from mlsys.constants import (
@@ -35,15 +55,6 @@ from mlsys.constants import (
from mlsys.formatting import fmt, check, md_math from mlsys.formatting import fmt, check, md_math
from mlsys.formulas import calc_effective_flops from mlsys.formulas import calc_effective_flops
# =============================================================================
# PURPOSE
# =============================================================================
# Purpose: Compute all values for the C³ Taxonomy appendix.
# Used in: Case studies, effective FLOPS, scorecard, and inline prose.
#
# Philosophy: C³ parallels D·A·M — three MECE axes for fleet-scale diagnosis.
# Every computed value traces back to constants.py.
class C3Taxonomy: class C3Taxonomy:
"""Namespace for C³ diagnostic examples.""" """Namespace for C³ diagnostic examples."""
@@ -71,7 +82,7 @@ class C3Taxonomy:
case3_oh_maintenance_pct = OVERHEAD_MAINTENANCE * 100 case3_oh_maintenance_pct = OVERHEAD_MAINTENANCE * 100
# Effective FLOPS calculation: 100K GPU cluster # Effective FLOPS calculation: 100K GPU cluster
h100_tflops = H100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude h100_tflops = H100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second)
n_gpus_eff = 100_000 n_gpus_eff = 100_000
peak_pflops = n_gpus_eff * h100_tflops / 1000 # PFLOPs peak_pflops = n_gpus_eff * h100_tflops / 1000 # PFLOPs
goodput_all = 1.0 - (OVERHEAD_PIPELINE_BUBBLE + goodput_all = 1.0 - (OVERHEAD_PIPELINE_BUBBLE +
@@ -80,7 +91,7 @@ class C3Taxonomy:
OVERHEAD_MAINTENANCE) OVERHEAD_MAINTENANCE)
effective_pflops = calc_effective_flops( effective_pflops = calc_effective_flops(
peak_pflops, MFU_TRAINING_HIGH, SCALING_EFF_8192GPU, goodput_all peak_pflops, MFU_TRAINING_HIGH, SCALING_EFF_8192GPU, goodput_all
) ).magnitude # extract float; calc_effective_flops returns Quantity since formulas.py upgrade
c3_tax = peak_pflops / effective_pflops c3_tax = peak_pflops / effective_pflops
eff_fraction = effective_pflops / peak_pflops eff_fraction = effective_pflops / peak_pflops
@@ -445,12 +456,8 @@ The gap between scaling-law predictions and observed training outcomes is, in la
```{python} ```{python}
#| label: appendix-c3-effective-flops #| label: appendix-c3-effective-flops
#| echo: false #| echo: false
# Goal: Alias C3Taxonomy strings for the 100K-GPU effective FLOPS callout prose.
# ============================================================================= # Exports: peak_str, eff_str, eff_pct_str, c3_tax_str, mfu_str, scaling_str, goodput_str
# PURPOSE
# =============================================================================
# Purpose: Format effective FLOPS values for the worked example.
# Used in: Effective FLOPS worked example prose.
peak_str = C3.peak_pflops_str peak_str = C3.peak_pflops_str
eff_str = C3.effective_pflops_str eff_str = C3.effective_pflops_str

View File

@@ -15,6 +15,23 @@ This appendix collects the reference numbers and compact models for fleet-scale
```{python} ```{python}
#| label: appendix-fleet-setup #| label: appendix-fleet-setup
#| echo: false #| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ FLEET FOUNDATIONS — MASTER COMPUTATION
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: PERSISTENT — All values used throughout the Fleet Foundations
# │ appendix: hardware reference table, MTBF tables, checkpoint sizing,
# │ effective FLOPS, comm-compute ratio, and all prose inline values.
# │
# │ Goal: Provide all quantitative fleet engineering constants in one place
# │ for the "Numbers Every Fleet Engineer Should Know" reference appendix.
# │ Show: See individual section cells for formatted values. This cell provides
# │ the physics; formatting cells convert to display strings.
# │ How: pint Quantities from mlsys.constants; fleet formulas from formulas.py;
# │ all results as typed Quantities or raw floats via .m_as().
# │
# │ Imports: mlsys.constants (*), mlsys.formulas (calc_*), mlsys.formatting (fmt, check)
# │ Exports: FF = FleetFoundations (accessed as FF.attribute in downstream cells)
# └─────────────────────────────────────────────────────────────────────────────
import math import math
from mlsys.constants import * from mlsys.constants import *
@@ -26,27 +43,13 @@ from mlsys.formulas import (
calc_young_daly_interval, calc_checkpoint_size calc_young_daly_interval, calc_checkpoint_size
) )
# =============================================================================
# PURPOSE
# =============================================================================
# Purpose: Compute all values for the Fleet Foundations appendix.
# Used in: Reference tables, worked examples, and inline prose throughout.
#
# Philosophy: Fleet-scale numbers emphasize RATIOS between tiers and
# SCALING BEHAVIOR with cluster size. Absolute values are
# current-generation snapshots; ratios persist across generations.
# =============================================================================
# NETWORK HIERARCHY
# =============================================================================
class FleetFoundations: class FleetFoundations:
"""Namespace for fleet-scale reference calculations.""" """Namespace for fleet-scale reference calculations."""
# ── Communication Numbers ──────────────────────────────────────────────── # ── Communication Numbers ────────────────────────────────────────────────
# Bandwidth hierarchy (GB/s) # Bandwidth hierarchy (GB/s)
nvlink_h100_bw = int(NVLINK_H100_BW.to(GB / second).magnitude) nvlink_h100_bw = int(NVLINK_H100_BW.m_as(GB / second))
pcie5_bw = int(PCIE_GEN5_BW.to(GB / second).magnitude) pcie5_bw = int(PCIE_GEN5_BW.m_as(GB / second))
ib_ndr_bw = INFINIBAND_NDR_BW_GBS ib_ndr_bw = INFINIBAND_NDR_BW_GBS
ib_hdr_bw = INFINIBAND_HDR_BW_GBS ib_hdr_bw = INFINIBAND_HDR_BW_GBS
ib_xdr_bw = INFINIBAND_XDR_BW_GBS ib_xdr_bw = INFINIBAND_XDR_BW_GBS
@@ -95,28 +98,29 @@ class FleetFoundations:
mtbf_100k_h = calc_mtbf_cluster(GPU_MTTF_HOURS, cl_mega) mtbf_100k_h = calc_mtbf_cluster(GPU_MTTF_HOURS, cl_mega)
# Convert to minutes for readability # Convert to minutes for readability
mtbf_256_min = mtbf_256_h * 60 mtbf_256_min = mtbf_256_h.m_as(ureg.minute)
mtbf_2048_min = mtbf_2048_h * 60 mtbf_2048_min = mtbf_2048_h.m_as(ureg.minute)
mtbf_8192_min = mtbf_8192_h * 60 mtbf_8192_min = mtbf_8192_h.m_as(ureg.minute)
mtbf_100k_min = mtbf_100k_h * 60 mtbf_100k_min = mtbf_100k_h.m_as(ureg.minute)
# Failure probability for a 24-hour job (using hours consistently) # Failure probability for a 24-hour job
pfail_256_24h = calc_failure_probability(mtbf_256_h, 24) _24h = 24 * ureg.hour
pfail_2048_24h = calc_failure_probability(mtbf_2048_h, 24) pfail_256_24h = calc_failure_probability(mtbf_256_h, _24h)
pfail_8192_24h = calc_failure_probability(mtbf_8192_h, 24) pfail_2048_24h = calc_failure_probability(mtbf_2048_h, _24h)
pfail_100k_24h = calc_failure_probability(mtbf_100k_h, 24) pfail_8192_24h = calc_failure_probability(mtbf_8192_h, _24h)
pfail_100k_24h = calc_failure_probability(mtbf_100k_h, _24h)
# Checkpoint sizes (bytes) # Checkpoint sizes
ckpt_7b = calc_checkpoint_size(7e9) ckpt_7b = calc_checkpoint_size(7e9) # Quantity[byte]
ckpt_70b = calc_checkpoint_size(70e9) ckpt_70b = calc_checkpoint_size(70e9)
ckpt_175b = calc_checkpoint_size(175e9) ckpt_175b = calc_checkpoint_size(175e9)
ckpt_1t = calc_checkpoint_size(1e12) ckpt_1t = calc_checkpoint_size(1e12)
# Convert to GB # Extract in GB/TB
ckpt_7b_gb = ckpt_7b / 1e9 ckpt_7b_gb = ckpt_7b.m_as(GB)
ckpt_70b_gb = ckpt_70b / 1e9 ckpt_70b_gb = ckpt_70b.m_as(GB)
ckpt_175b_gb = ckpt_175b / 1e9 ckpt_175b_gb = ckpt_175b.m_as(GB)
ckpt_1t_tb = ckpt_1t / 1e12 ckpt_1t_tb = ckpt_1t.m_as(TB)
# Overhead budgets # Overhead budgets
oh_pipeline = int(OVERHEAD_PIPELINE_BUBBLE * 100) oh_pipeline = int(OVERHEAD_PIPELINE_BUBBLE * 100)
@@ -125,20 +129,20 @@ class FleetFoundations:
oh_maintenance = int(OVERHEAD_MAINTENANCE * 100) oh_maintenance = int(OVERHEAD_MAINTENANCE * 100)
# ── Hardware Reference ─────────────────────────────────────────────────── # ── Hardware Reference ───────────────────────────────────────────────────
h100_flops = int(H100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude) h100_flops = int(H100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second))
h100_bw_tbs = f"{H100_MEM_BW.to(TB / second).magnitude:.2f}" h100_bw_tbs = f"{H100_MEM_BW.m_as(TB / second):.2f}"
h100_cap = int(H100_MEM_CAPACITY.to(GiB).magnitude) h100_cap = int(H100_MEM_CAPACITY.m_as(GiB))
h100_tdp = int(H100_TDP.magnitude) h100_tdp = int(H100_TDP.m_as(watt))
b200_flops = int(B200_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude) b200_flops = int(B200_FLOPS_FP16_TENSOR.m_as(TFLOPs / second))
b200_bw_tbs = f"{B200_MEM_BW.to(TB / second).magnitude:.0f}" b200_bw_tbs = f"{B200_MEM_BW.m_as(TB / second):.0f}"
b200_cap = int(B200_MEM_CAPACITY.to(GiB).magnitude) b200_cap = int(B200_MEM_CAPACITY.m_as(GiB))
b200_tdp = int(B200_TDP.magnitude) b200_tdp = int(B200_TDP.m_as(watt))
tpuv5_flops = int(TPUV5P_FLOPS_BF16.to(TFLOPs / second).magnitude) tpuv5_flops = int(TPUV5P_FLOPS_BF16.m_as(TFLOPs / second))
tpuv5_bw_tbs = f"{TPUV5P_MEM_BW.to(TB / second).magnitude:.2f}" tpuv5_bw_tbs = f"{TPUV5P_MEM_BW.m_as(TB / second):.2f}"
tpuv5_cap = int(TPUV5P_MEM_CAPACITY.to(GiB).magnitude) tpuv5_cap = int(TPUV5P_MEM_CAPACITY.m_as(GiB))
tpuv5_ici = int(TPUV5P_ICI_BW.to(GB / second).magnitude) tpuv5_ici = int(TPUV5P_ICI_BW.m_as(GB / second))
# ── Power and Sustainability ───────────────────────────────────────────── # ── Power and Sustainability ─────────────────────────────────────────────
rack_trad = RACK_POWER_TRADITIONAL_KW rack_trad = RACK_POWER_TRADITIONAL_KW
@@ -154,17 +158,19 @@ class FleetFoundations:
# ── Effective FLOPS Example ────────────────────────────────────────────── # ── Effective FLOPS Example ──────────────────────────────────────────────
# 1024-GPU cluster, H100, realistic overheads # 1024-GPU cluster, H100, realistic overheads
peak_1024 = 1024 * H100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude _peak_1024_qty = 1024 * H100_FLOPS_FP16_TENSOR # Quantity[TFLOPs/s]
peak_1024 = _peak_1024_qty.m_as(TFLOPs / second) # raw float for display
goodput_ratio = 1.0 - (OVERHEAD_PIPELINE_BUBBLE + goodput_ratio = 1.0 - (OVERHEAD_PIPELINE_BUBBLE +
OVERHEAD_CHECKPOINT + OVERHEAD_CHECKPOINT +
OVERHEAD_FAILURE_RECOVERY + OVERHEAD_FAILURE_RECOVERY +
OVERHEAD_MAINTENANCE) OVERHEAD_MAINTENANCE)
eff_flops_1024 = calc_effective_flops( _eff_flops_1024_qty = calc_effective_flops(
peak_1024, _peak_1024_qty,
MFU_TRAINING_HIGH, MFU_TRAINING_HIGH,
SCALING_EFF_1024GPU, SCALING_EFF_1024GPU,
goodput_ratio goodput_ratio
) ) # Quantity[flop/second]
eff_flops_1024 = _eff_flops_1024_qty.m_as(TFLOPs / second) # raw float for display
eff_fraction = eff_flops_1024 / peak_1024 eff_fraction = eff_flops_1024 / peak_1024
# ── Invariant Checks ───────────────────────────────────────────────────── # ── Invariant Checks ─────────────────────────────────────────────────────
@@ -289,12 +295,8 @@ Communication defines the boundaries of parallelism. These tables quantify the b
```{python} ```{python}
#| label: fleet-comm-numbers #| label: fleet-comm-numbers
#| echo: false #| echo: false
# Goal: Format communication bandwidth and latency strings for @tbl-fleet-bandwidth-hierarchy and @tbl-fleet-latency-hierarchy.
# ============================================================================= # Exports: nvlink_bw_str, pcie5_bw_str, ib_*_str, tpuv5_ici_str, nvlink_to_ib_str, *_lat_str
# PURPOSE
# =============================================================================
# Purpose: Compute communication hierarchy values for inline references.
# Used in: Communication numbers tables and prose.
# ── Bandwidth ratios ──────────────────────────────────────────────────────── # ── Bandwidth ratios ────────────────────────────────────────────────────────
nvlink_bw_str = fmt(FF.nvlink_h100_bw, precision=0) nvlink_bw_str = fmt(FF.nvlink_h100_bw, precision=0)
@@ -386,12 +388,8 @@ At fleet scale, coordination---failure recovery, checkpointing, and maintenance-
```{python} ```{python}
#| label: fleet-mtbf-table #| label: fleet-mtbf-table
#| echo: false #| echo: false
# Goal: Format MTBF hours, minutes, and P(failure) percentages for @tbl-fleet-mtbf.
# ============================================================================= # Exports: mtbf_256_str, mtbf_2048_str, mtbf_8192_str, mtbf_100k_str, mtbf_*_min_str, pfail_*_str
# PURPOSE
# =============================================================================
# Purpose: Format MTBF and failure probability values for the table.
# Used in: MTBF by cluster size table.
mtbf_256_str = fmt(FF.mtbf_256_h, precision=1, commas=False) mtbf_256_str = fmt(FF.mtbf_256_h, precision=1, commas=False)
mtbf_2048_str = fmt(FF.mtbf_2048_h, precision=1, commas=False) mtbf_2048_str = fmt(FF.mtbf_2048_h, precision=1, commas=False)
@@ -432,12 +430,8 @@ Checkpointing is the primary recovery mechanism, and its cost depends on the mod
```{python} ```{python}
#| label: fleet-checkpoint-sizes #| label: fleet-checkpoint-sizes
#| echo: false #| echo: false
# Goal: Format checkpoint sizes in GB/TB for @tbl-fleet-checkpoint-sizes.
# ============================================================================= # Exports: ckpt_7b_str, ckpt_70b_str, ckpt_175b_str, ckpt_1t_str
# PURPOSE
# =============================================================================
# Purpose: Format checkpoint sizes for the reference table.
# Used in: Checkpoint size table.
ckpt_7b_str = fmt(FF.ckpt_7b_gb, precision=0) ckpt_7b_str = fmt(FF.ckpt_7b_gb, precision=0)
ckpt_70b_str = fmt(FF.ckpt_70b_gb, precision=0) ckpt_70b_str = fmt(FF.ckpt_70b_gb, precision=0)
@@ -484,12 +478,8 @@ These numbers reflect the current generation of fleet-scale hardware. Use them f
```{python} ```{python}
#| label: fleet-hardware-ref #| label: fleet-hardware-ref
#| echo: false #| echo: false
# Goal: Format H100, B200, and TPU v5p specs for @tbl-fleet-hardware-ref.
# ============================================================================= # Exports: h100_flops_str, h100_bw_str, h100_cap_str, h100_tdp_str, b200_*, tpuv5_*
# PURPOSE
# =============================================================================
# Purpose: Format hardware reference values for the comparison table.
# Used in: Current hardware reference table.
h100_flops_str = fmt(FF.h100_flops, precision=0) h100_flops_str = fmt(FF.h100_flops, precision=0)
h100_bw_str = FF.h100_bw_tbs h100_bw_str = FF.h100_bw_tbs
@@ -547,36 +537,52 @@ Volume I introduced Amdahl's Law for a single machine, where the serial fraction
```{python} ```{python}
#| label: fleet-amdahl-example #| label: fleet-amdahl-example
#| echo: false #| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ FLEET AMDAHL EXAMPLE
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: @sec-fleet-foundations-amdahls-fleet worked example
# │
# │ Goal: Compute Amdahl's Law speedup at 32/256/1024/8192 GPUs for 10% serial fraction.
# │ Show: Speedup values and the Amdahl ceiling for inline prose.
# │ How: calc_amdahls_speedup() from formulas.py; check() for invariants.
# │
# │ Imports: mlsys.formulas (calc_amdahls_speedup), mlsys.formatting (fmt, check)
# │ Exports: s_fleet_pct_str, max_speedup_str, su_32_str, su_256_str, su_1024_str, su_8192_str
# └─────────────────────────────────────────────────────────────────────────────
# ============================================================================= class FleetAmdahlExample:
# PURPOSE """Amdahl's Law speedup at 32/256/1024/8192 GPUs for 10% serial fraction."""
# =============================================================================
# Purpose: Compute Amdahl's Law examples at fleet scale.
# Used in: Amdahl's Law at Fleet Scale worked example.
# ── PARAMETERS ────────────────────────────────────────────────────────────── # ── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
s_fleet = 0.10 # 10% serial fraction (communication + sync) s_fleet = 0.10
n_values = [32, 256, 1024, 8192] n_values = [32, 256, 1024, 8192]
# ── CALCULATION ───────────────────────────────────────────────────────────── # ── 2. CALCULATION (The Physics) ────────────────────────────────────────
speedups = {} speedups = {}
for n in n_values: for _n in n_values:
su = calc_amdahls_speedup(1 - s_fleet, n) speedups[_n] = calc_amdahls_speedup(1 - s_fleet, _n)
speedups[n] = su
max_speedup = 1 / s_fleet max_speedup = 1 / s_fleet
# ── INVARIANTS ────────────────────────────────────────────────────────────── # ── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(speedups[8192] < max_speedup, "Speedup must be below Amdahl limit") check(speedups[8192] < max_speedup, "Speedup must be below Amdahl limit")
check(max_speedup == 10.0, "Max speedup for 10% serial fraction must be 10x") check(max_speedup == 10.0, "Max speedup for 10% serial fraction must be 10x")
# ── OUTPUTS ───────────────────────────────────────────────────────────────── # ── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
s_fleet_pct_str = "10" s_fleet_pct_str = "10"
max_speedup_str = fmt(max_speedup, precision=0, commas=False) max_speedup_str = fmt(max_speedup, precision=0, commas=False)
su_32_str = fmt(speedups[32], precision=1, commas=False) su_32_str = fmt(speedups[32], precision=1, commas=False)
su_256_str = fmt(speedups[256], precision=1, commas=False) su_256_str = fmt(speedups[256], precision=1, commas=False)
su_1024_str = fmt(speedups[1024], precision=1, commas=False) su_1024_str = fmt(speedups[1024], precision=1, commas=False)
su_8192_str = fmt(speedups[8192], precision=1, commas=False) su_8192_str = fmt(speedups[8192], precision=1, commas=False)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
s_fleet_pct_str = FleetAmdahlExample.s_fleet_pct_str
max_speedup_str = FleetAmdahlExample.max_speedup_str
su_32_str = FleetAmdahlExample.su_32_str
su_256_str = FleetAmdahlExample.su_256_str
su_1024_str = FleetAmdahlExample.su_1024_str
su_8192_str = FleetAmdahlExample.su_8192_str
``` ```
To see the fleet-scale implications, consider a training workload where `{python} s_fleet_pct_str`% of wall-clock time is spent in synchronization, communication, and other serial overhead. Amdahl's Law gives the following speedups: To see the fleet-scale implications, consider a training workload where `{python} s_fleet_pct_str`% of wall-clock time is spent in synchronization, communication, and other serial overhead. Amdahl's Law gives the following speedups:
@@ -604,58 +610,72 @@ When $\rho < 1$, computation dominates and communication can be overlapped. When
```{python} ```{python}
#| label: fleet-comm-comp-ratio #| label: fleet-comm-comp-ratio
#| echo: false #| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ FLEET COMM-COMPUTE RATIO
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: @sec-fleet-foundations-comm-compute-ratio worked example (@tbl-fleet-comm-comp)
# │
# │ Goal: Compute ρ = T_comm / T_comp for 3 scenarios: 7B DP, 350M DP, tensor-parallel.
# │ Show: AllReduce times in ms and ρ ratios for each scenario; ~0.1 for DP 7B, ~3 for DP 350M.
# │ How: calc_ring_allreduce_time() with IB NDR params; NVLink BW for tensor-parallel.
# │
# │ Imports: mlsys.constants (INFINIBAND_NDR_BW_GBS, IB_NDR_LATENCY_US, NVLINK_H100_BW, GB, second)
# │ Exports: ar_7b_ms_str, rho_7b_str, ar_350m_ms_str, rho_350m_str, rho_tp_str
# └─────────────────────────────────────────────────────────────────────────────
# ============================================================================= class FleetCommCompRatio:
# PURPOSE """Communication-to-computation ratio ρ for three parallelism scenarios."""
# =============================================================================
# Purpose: Compute communication-computation ratios for different scenarios.
# Used in: Communication-computation ratio worked example.
# ── SCENARIO 1: Data parallelism, large model ────────────────────────────── # ── SCENARIO 1: Data parallelism, large model ──────────────────────────
# 7B model, 256 GPUs, IB NDR # 7B model, 256 GPUs, IB NDR
grad_bytes_7b = 7e9 * 2 # 7B params * 2 bytes (BF16 gradients) grad_bytes_7b = 7e9 * 2 # 7B params * 2 bytes (BF16 gradients)
allreduce_time_7b = calc_ring_allreduce_time( allreduce_time_7b = calc_ring_allreduce_time(
message_bytes=grad_bytes_7b, message_bytes=grad_bytes_7b,
n_gpus=256, n_gpus=256,
bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9, bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9,
latency_s=IB_NDR_LATENCY_US * 1e-6 latency_s=IB_NDR_LATENCY_US * 1e-6
) ) # Quantity[second]
# Computation time: assume ~50ms forward+backward per step comp_time_7b = 0.050 # 50 ms (seconds)
comp_time_7b = 0.050 # 50 ms rho_7b = allreduce_time_7b.m_as(ureg.second) / comp_time_7b
rho_7b = allreduce_time_7b / comp_time_7b
# ── SCENARIO 2: Data parallelism, small model ────────────────────────────── # ── SCENARIO 2: Data parallelism, small model ──────────────────────────
# 350M model, 256 GPUs, IB NDR # 350M model, 256 GPUs, IB NDR
grad_bytes_350m = 350e6 * 2 # 350M params * 2 bytes grad_bytes_350m = 350e6 * 2 # 350M params * 2 bytes
allreduce_time_350m = calc_ring_allreduce_time( allreduce_time_350m = calc_ring_allreduce_time(
message_bytes=grad_bytes_350m, message_bytes=grad_bytes_350m,
n_gpus=256, n_gpus=256,
bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9, bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9,
latency_s=IB_NDR_LATENCY_US * 1e-6 latency_s=IB_NDR_LATENCY_US * 1e-6
) ) # Quantity[second]
comp_time_350m = 0.005 # 5 ms (smaller model) comp_time_350m = 0.005 # 5 ms (seconds, smaller model)
rho_350m = allreduce_time_350m / comp_time_350m rho_350m = allreduce_time_350m.m_as(ureg.second) / comp_time_350m
# ── SCENARIO 3: Tensor parallelism, within node ──────────────────────────── # ── SCENARIO 3: Tensor parallelism, within node ────────────────────────
# Activation transfer: 8 GPUs, NVLink, ~16 MB per layer # Activation transfer: 8 GPUs, NVLink, ~16 MB per layer
act_bytes = 16e6 # 16 MB act_bytes = 16e6 # 16 MB
act_transfer_time = act_bytes / (NVLINK_H100_BW.to(GB / second).magnitude * 1e9) act_transfer_time = act_bytes / (NVLINK_H100_BW.m_as(GB / second) * 1e9)
comp_time_layer = 0.001 # 1 ms per layer comp_time_layer = 0.001 # 1 ms per layer
rho_tp = act_transfer_time / comp_time_layer rho_tp = act_transfer_time / comp_time_layer
# ── INVARIANTS ────────────────────────────────────────────────────────────── # ── INVARIANTS ──────────────────────────────────────────────────────────
check(rho_7b > 0.1, "7B comm ratio must be non-trivial") check(rho_7b > 0.1, "7B comm ratio must be non-trivial")
check(rho_350m > 0.01, "350M comm ratio must be non-trivial") check(rho_350m > 0.01, "350M comm ratio must be non-trivial")
# ── OUTPUTS ───────────────────────────────────────────────────────────────── # ── OUTPUTS ─────────────────────────────────────────────────────────────
ar_7b_ms_str = fmt(allreduce_time_7b * 1000, precision=1, commas=False) ar_7b_ms_str = fmt(allreduce_time_7b.m_as(ureg.millisecond), precision=1, commas=False)
rho_7b_str = fmt(rho_7b, precision=2, commas=False) rho_7b_str = fmt(rho_7b, precision=2, commas=False)
ar_350m_ms_str = fmt(allreduce_time_350m.m_as(ureg.millisecond), precision=1, commas=False)
rho_350m_str = fmt(rho_350m, precision=1, commas=False)
rho_tp_str = fmt(rho_tp, precision=3, commas=False)
ar_350m_ms_str = fmt(allreduce_time_350m * 1000, precision=1, commas=False) # ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
rho_350m_str = fmt(rho_350m, precision=1, commas=False) ar_7b_ms_str = FleetCommCompRatio.ar_7b_ms_str
rho_7b_str = FleetCommCompRatio.rho_7b_str
rho_tp_str = fmt(rho_tp, precision=3, commas=False) rho_7b = FleetCommCompRatio.rho_7b # raw float used in fmt() call in prose
ar_350m_ms_str = FleetCommCompRatio.ar_350m_ms_str
rho_350m_str = FleetCommCompRatio.rho_350m_str
rho_tp_str = FleetCommCompRatio.rho_tp_str
``` ```
@tbl-fleet-comm-comp shows the ratio for three representative scenarios. The contrast between them reveals why parallelism strategy must match the workload. @tbl-fleet-comm-comp shows the ratio for three representative scenarios. The contrast between them reveals why parallelism strategy must match the workload.
@@ -685,12 +705,8 @@ The key insight for fleet-scale ML is that weak scaling is not just a mathematic
```{python} ```{python}
#| label: fleet-effective-flops #| label: fleet-effective-flops
#| echo: false #| echo: false
# Goal: Format peak and effective FLOPS for the 1,024-GPU compound loss callout.
# ============================================================================= # Exports: peak_str, eff_str, eff_pct_str, goodput_pct_str, mfu_pct_str, scaling_pct_str
# PURPOSE
# =============================================================================
# Purpose: Compute effective FLOPS for the compound loss example.
# Used in: Effective FLOPS worked example.
peak_str = fmt(FF.peak_1024, precision=0) peak_str = fmt(FF.peak_1024, precision=0)
eff_str = fmt(FF.eff_flops_1024, precision=0) eff_str = fmt(FF.eff_flops_1024, precision=0)

View File

@@ -35,6 +35,28 @@ This appendix is designed as a *reference*. Use it when you need to move from in
```{python} ```{python}
#| label: appendix-reliability-setup #| label: appendix-reliability-setup
#| echo: false #| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ RELIABILITY FOUNDATIONS — MASTER COMPUTATION
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: PERSISTENT — All values used throughout the Reliability Foundations
# │ appendix: @tbl-component-fit, @tbl-mtbf-cluster, @tbl-failure-prob,
# │ @tbl-checkpoint-size, @tbl-recovery-anatomy, @tbl-strategy-comparison,
# │ @tbl-availability-stacking, and all Young-Daly worked examples.
# │
# │ Goal: Provide all reliability constants — FIT rates, MTBF cascade, Young-Daly
# │ optimal checkpoint interval, recovery anatomy, and availability stacking —
# │ for the "Failure as a Physical Constraint" reference appendix.
# │ Show: See individual section cells for formatted values. This cell provides
# │ the physics; formatting cells and f-strings convert to display strings.
# │ How: pint Quantities from mlsys.constants; calc_mtbf_node, calc_mtbf_cluster,
# │ calc_young_daly_interval, calc_failure_probability, calc_checkpoint_size,
# │ calc_availability_stacked from formulas.py; all extractions via .m_as().
# │
# │ Imports: mlsys.constants (*), mlsys.formulas (calc_mtbf_*, calc_young_daly_interval,
# │ calc_failure_probability, calc_checkpoint_size, calc_availability_stacked)
# │ mlsys.formatting (fmt, check)
# │ Exports: R = ReliabilityFoundations (accessed as R.attribute in downstream cells)
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import * from mlsys.constants import *
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
@@ -103,8 +125,9 @@ class ReliabilityFoundations:
@classmethod @classmethod
def p_failure(cls, n_gpus, duration_hours): def p_failure(cls, n_gpus, duration_hours):
mtbf_h = cls.cluster_mtbf(n_gpus) mtbf_h = cls.cluster_mtbf(n_gpus) # Quantity[hour]
return calc_failure_probability(mtbf_h, duration_hours) dur_h = duration_hours * ureg.hour # attach unit
return calc_failure_probability(mtbf_h, dur_h)
# ┌── 5. CHECKPOINT SIZING ──────────────────────────────────────── # ┌── 5. CHECKPOINT SIZING ────────────────────────────────────────
# Mixed-precision Adam: 16 bytes/param # Mixed-precision Adam: 16 bytes/param
@@ -114,25 +137,28 @@ class ReliabilityFoundations:
@classmethod @classmethod
def ckpt_size_gb(cls, n_params): def ckpt_size_gb(cls, n_params):
return calc_checkpoint_size(n_params, cls.bytes_per_param) / 1e9 return calc_checkpoint_size(n_params, cls.bytes_per_param).m_as(GB)
# ┌── 6. YOUNG-DALY (10K cluster, 175B model) ──────────────────── # ┌── 6. YOUNG-DALY (10K cluster, 175B model) ────────────────────
ckpt_175b_bytes = calc_checkpoint_size(175e9, 16) ckpt_175b_bytes = calc_checkpoint_size(175e9, 16) # Quantity[byte]
ckpt_175b_gb = ckpt_175b_bytes / 1e9 ckpt_175b_gb = ckpt_175b_bytes.m_as(GB) # raw float in GB
ckpt_write_bw = CHECKPOINT_WRITE_BW_GBS # GB/s ckpt_write_bw = CHECKPOINT_WRITE_BW_GBS # GB/s (raw float)
ckpt_write_time_s = ckpt_175b_gb / ckpt_write_bw ckpt_write_time_s = ckpt_175b_gb / ckpt_write_bw # raw float (seconds)
cluster_mtbf_10k_s = cluster_mtbf_10k * SEC_PER_HOUR cluster_mtbf_10k_s = cluster_mtbf_10k.m_as(ureg.second) # raw float (seconds)
tau_opt_s = calc_young_daly_interval(ckpt_write_time_s, cluster_mtbf_10k_s) tau_opt_s = calc_young_daly_interval(ckpt_write_time_s, cluster_mtbf_10k_s) # Quantity[second]
tau_opt_min = tau_opt_s / SECONDS_PER_MINUTE tau_opt_min = tau_opt_s.m_as(ureg.minute) # raw float in minutes
# ┌── 7. RECOVERY TIME ─────────────────────────────────────────── # ┌── 7. RECOVERY TIME ───────────────────────────────────────────
t_detect = HEARTBEAT_TIMEOUT_S t_detect = HEARTBEAT_TIMEOUT_S # raw float (seconds) — kept for table display
t_reschedule = RESCHEDULE_TIME_S t_reschedule = RESCHEDULE_TIME_S # raw float (seconds) — kept for table display
t_reload_s = ckpt_write_time_s # same BW, same size t_reload_s = ckpt_write_time_s # raw float (seconds)
# Replay: half the interval on average # Replay: half the interval on average
t_replay_s = tau_opt_s / 2 t_replay_s = tau_opt_s / 2 # Quantity[second]
t_recovery_total_s = t_detect + t_reschedule + t_reload_s + t_replay_s # Sum: attach units to raw seconds, then extract in minutes
t_recovery_total_s = (
(t_detect + t_reschedule + t_reload_s) * ureg.second + t_replay_s
).m_as(ureg.minute) # raw float in minutes
# ┌── 8. GOODPUT ───────────────────────────────────────────────── # ┌── 8. GOODPUT ─────────────────────────────────────────────────
overhead_ckpt = OVERHEAD_CHECKPOINT overhead_ckpt = OVERHEAD_CHECKPOINT
@@ -150,8 +176,8 @@ class ReliabilityFoundations:
R = ReliabilityFoundations # short alias for inline use R = ReliabilityFoundations # short alias for inline use
# ┌── INVARIANTS ────────────────────────────────────────────────────── # ┌── INVARIANTS ──────────────────────────────────────────────────────
check(R.cluster_mtbf_10k < 5.0, check(R.cluster_mtbf_10k.m_as(ureg.hour) < 5.0,
f"10K cluster MTBF should be < 5 hours, got {R.cluster_mtbf_10k:.2f}") f"10K cluster MTBF should be < 5 hours, got {R.cluster_mtbf_10k.m_as(ureg.hour):.2f}")
check(R.tau_opt_min > 5 and R.tau_opt_min < 60, check(R.tau_opt_min > 5 and R.tau_opt_min < 60,
f"Young-Daly interval should be 5-60 min, got {R.tau_opt_min:.1f}") f"Young-Daly interval should be 5-60 min, got {R.tau_opt_min:.1f}")
check(R.p_failure(10_000, 24) > 0.99, check(R.p_failure(10_000, 24) > 0.99,
@@ -159,12 +185,12 @@ check(R.p_failure(10_000, 24) > 0.99,
# ┌── FORMATTED OUTPUTS ────────────────────────────────────────────── # ┌── FORMATTED OUTPUTS ──────────────────────────────────────────────
gpu_mttf_str = fmt(R.gpu_mttf, precision=0) gpu_mttf_str = fmt(R.gpu_mttf, precision=0)
node_mtbf_str = fmt(R.node_mtbf, precision=0) node_mtbf_str = fmt(R.node_mtbf.m_as(ureg.hour), precision=0)
cluster_mtbf_10k_str = fmt(R.cluster_mtbf_10k, precision=2) cluster_mtbf_10k_str = fmt(R.cluster_mtbf_10k.m_as(ureg.hour), precision=2)
tau_opt_min_str = fmt(R.tau_opt_min, precision=1) tau_opt_min_str = fmt(R.tau_opt_min, precision=1)
ckpt_175b_gb_str = fmt(R.ckpt_175b_gb, precision=0) ckpt_175b_gb_str = fmt(R.ckpt_175b_gb, precision=0)
ckpt_write_time_str = fmt(R.ckpt_write_time_s, precision=1) ckpt_write_time_str = fmt(R.ckpt_write_time_s, precision=1)
t_recovery_str = fmt(R.t_recovery_total_s / SECONDS_PER_MINUTE, precision=1) t_recovery_str = fmt(R.t_recovery_total_s, precision=1)
``` ```
## Failure Probability at Scale {#sec-reliability-foundations-failure-probability} ## Failure Probability at Scale {#sec-reliability-foundations-failure-probability}
@@ -188,8 +214,8 @@ $$ \text{MTTF} = \frac{10^9}{\text{FIT}} $$ {#eq-mttf-from-fit}
```{python} ```{python}
#| label: component-fit-table #| label: component-fit-table
#| echo: false #| echo: false
# Goal: Format per-component MTTF in years for @tbl-component-fit.
# Format component data for the table # Exports: gpu_mttf_yr, hbm_mttf_yr, nic_mttf_yr, psu_mttf_yr, pcie_mttf_yr, cable_mttf_yr, tor_mttf_yr
gpu_mttf_yr = f"{R.gpu_mttf / HOURS_PER_YEAR:.1f}" gpu_mttf_yr = f"{R.gpu_mttf / HOURS_PER_YEAR:.1f}"
hbm_mttf_yr = f"{R.hbm_mttf / HOURS_PER_YEAR:.1f}" hbm_mttf_yr = f"{R.hbm_mttf / HOURS_PER_YEAR:.1f}"
nic_mttf_yr = f"{R.nic_mttf / HOURS_PER_YEAR:.1f}" nic_mttf_yr = f"{R.nic_mttf / HOURS_PER_YEAR:.1f}"
@@ -233,24 +259,24 @@ For a cluster of $N$ identical nodes, the same logic applies one level up:
$$ \text{MTBF}_\text{cluster} = \frac{\text{MTBF}_\text{node}}{N} $$ {#eq-mtbf-cluster} $$ \text{MTBF}_\text{cluster} = \frac{\text{MTBF}_\text{node}}{N} $$ {#eq-mtbf-cluster}
This is the **MTBF cascade**: reliability degrades linearly with component count at each level, and the levels compound. A node with `{python} f"{R.node_mtbf:,.0f}"`-hour MTBF sounds reliable. A cluster of `{python} f"{R.nodes_10k:,}"` such nodes has an MTBF of just `{python} cluster_mtbf_10k_str` hours---a failure every few hours is the expected steady state. This is the **MTBF cascade**: reliability degrades linearly with component count at each level, and the levels compound. A node with `{python} f"{R.node_mtbf.m_as(ureg.hour):,.0f}"`-hour MTBF sounds reliable. A cluster of `{python} f"{R.nodes_10k:,}"` such nodes has an MTBF of just `{python} cluster_mtbf_10k_str` hours---a failure every few hours is the expected steady state.
@tbl-mtbf-cluster shows how cluster MTBF shrinks as fleet size grows. @tbl-mtbf-cluster shows how cluster MTBF shrinks as fleet size grows.
```{python} ```{python}
#| label: mtbf-cluster-table #| label: mtbf-cluster-table
#| echo: false #| echo: false
# Goal: Build MTBF row data (hours or minutes, failures/day) for @tbl-mtbf-cluster.
# Build MTBF table data # Exports: mtbf_data list of dicts with "gpus", "nodes", "mtbf", "per_day" keys
mtbf_data = [] mtbf_data = []
for n_gpus in R.cluster_sizes: for n_gpus in R.cluster_sizes:
n_nodes = R.nodes_for_gpus(n_gpus) n_nodes = R.nodes_for_gpus(n_gpus)
mtbf_h = R.cluster_mtbf(n_gpus) mtbf_h_val = R.cluster_mtbf(n_gpus).m_as(ureg.hour) # raw float in hours
if mtbf_h >= 1.0: if mtbf_h_val >= 1.0:
mtbf_str = f"{mtbf_h:.1f} hours" mtbf_str = f"{mtbf_h_val:.1f} hours"
else: else:
mtbf_str = f"{mtbf_h * SECONDS_PER_MINUTE:.0f} minutes" mtbf_str = f"{mtbf_h_val * 60:.0f} minutes"
per_day = 24 / mtbf_h per_day = 24 / mtbf_h_val
mtbf_data.append({ mtbf_data.append({
"gpus": f"{n_gpus:,}", "gpus": f"{n_gpus:,}",
"nodes": f"{n_nodes:,}", "nodes": f"{n_nodes:,}",
@@ -292,8 +318,8 @@ When $T_\text{job} \gg \text{MTBF}$, this probability approaches 1 rapidly. @tbl
```{python} ```{python}
#| label: failure-probability-table #| label: failure-probability-table
#| echo: false #| echo: false
# Goal: Compute P(≥1 failure) matrix for @tbl-failure-prob across cluster sizes and job durations.
# Build failure probability matrix # Exports: fp_data dict keyed by n_gpus; values are [1-day, 1-week, 30-day] probability strings
dur_labels = ["1 Day", "1 Week", "30 Days"] dur_labels = ["1 Day", "1 Week", "30 Days"]
fp_data = {} fp_data = {}
for n_gpus in R.cluster_sizes: for n_gpus in R.cluster_sizes:
@@ -370,6 +396,8 @@ $$ \text{Checkpoint Size} = N_\text{params} \times 16 \text{ bytes/param} $$ {#e
```{python} ```{python}
#| label: checkpoint-sizing-table #| label: checkpoint-sizing-table
#| echo: false #| echo: false
# Goal: Format checkpoint sizes and write times for @tbl-checkpoint-size across 7B1T models.
# Exports: ckpt_data list of dicts with "label", "ckpt_gb", "write_time" keys
ckpt_data = [] ckpt_data = []
for i, n_params in enumerate(R.model_sizes_params): for i, n_params in enumerate(R.model_sizes_params):
@@ -407,28 +435,50 @@ At frontier scale (175B+ parameters), checkpoint sizes reach the terabyte range.
```{python} ```{python}
#| label: worked-example-young-daly #| label: worked-example-young-daly
#| echo: false #| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ YOUNG-DALY WORKED EXAMPLE
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: @sec-reliability-foundations-worked-example callout
# │
# │ Goal: Compute optimal checkpoint interval τ_opt for 175B model on 10K-GPU cluster;
# │ show scaling to 20K GPUs.
# │ Show: ~28 min optimal interval, ~X% checkpoint overhead, shorter interval at 20K GPUs.
# │ How: calc_young_daly_interval(δ, MTBF_s) from R.ckpt_write_time_s and R.cluster_mtbf_10k_s.
# │
# │ Imports: mlsys.formulas (calc_young_daly_interval), mlsys.constants (GPUS_PER_HOST)
# │ Exports: yd_mtbf_h_str, yd_delta_str, yd_tau_min_str, yd_overhead_str, tau_20k_min_str
# └─────────────────────────────────────────────────────────────────────────────
# All values already computed in ReliabilityFoundations class WorkedExampleYoungDaly:
yd_mtbf_h = R.cluster_mtbf_10k """Young-Daly optimal checkpoint interval for 175B model on 10K-GPU cluster."""
yd_mtbf_s = R.cluster_mtbf_10k_s # All values already computed in ReliabilityFoundations
yd_delta = R.ckpt_write_time_s yd_mtbf_h = R.cluster_mtbf_10k # Quantity[hour]
yd_tau_s = R.tau_opt_s yd_mtbf_s = R.cluster_mtbf_10k_s # raw float (seconds)
yd_tau_min = R.tau_opt_min yd_delta = R.ckpt_write_time_s # raw float (seconds)
yd_tau_s = R.tau_opt_s # Quantity[second]
yd_tau_min = R.tau_opt_min # raw float in minutes
# Overhead from checkpointing alone # Overhead from checkpointing alone
yd_ckpt_overhead = (yd_delta / yd_tau_s) * 100 yd_ckpt_overhead = (yd_delta / yd_tau_s.m_as(ureg.second)) * 100
# What if MTBF halves (20K GPUs)? # What if MTBF halves (20K GPUs)?
mtbf_20k_h = R.node_mtbf / (20_000 // GPUS_PER_HOST) mtbf_20k_h = R.node_mtbf / (20_000 // GPUS_PER_HOST) # Quantity[hour]
mtbf_20k_s = mtbf_20k_h * SEC_PER_HOUR mtbf_20k_s = mtbf_20k_h.m_as(ureg.second) # raw float (seconds)
tau_20k_s = calc_young_daly_interval(yd_delta, mtbf_20k_s) tau_20k_s = calc_young_daly_interval(yd_delta, mtbf_20k_s) # Quantity[second]
tau_20k_min = tau_20k_s / SECONDS_PER_MINUTE tau_20k_min = tau_20k_s.m_as(ureg.minute) # raw float in minutes
yd_mtbf_h_str = fmt(yd_mtbf_h, precision=2) yd_mtbf_h_str = fmt(yd_mtbf_h.m_as(ureg.hour), precision=2)
yd_delta_str = fmt(yd_delta, precision=1) yd_delta_str = fmt(yd_delta, precision=1)
yd_tau_min_str = fmt(yd_tau_min, precision=1) yd_tau_min_str = fmt(yd_tau_min, precision=1)
yd_overhead_str = fmt(yd_ckpt_overhead, precision=1) yd_overhead_str = fmt(yd_ckpt_overhead, precision=1)
tau_20k_min_str = fmt(tau_20k_min, precision=1) tau_20k_min_str = fmt(tau_20k_min, precision=1)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
yd_mtbf_h_str = WorkedExampleYoungDaly.yd_mtbf_h_str
yd_delta_str = WorkedExampleYoungDaly.yd_delta_str
yd_tau_min_str = WorkedExampleYoungDaly.yd_tau_min_str
yd_overhead_str = WorkedExampleYoungDaly.yd_overhead_str
tau_20k_min_str = WorkedExampleYoungDaly.tau_20k_min_str
``` ```
::: {.callout-example title="Young-Daly: 175B Model on a 10,000-GPU Cluster"} ::: {.callout-example title="Young-Daly: 175B Model on a 10,000-GPU Cluster"}
@@ -470,12 +520,14 @@ $$ T_\text{recovery} = T_\text{detect} + T_\text{reschedule} + T_\text{reload} +
```{python} ```{python}
#| label: recovery-anatomy-table #| label: recovery-anatomy-table
#| echo: false #| echo: false
# Goal: Format recovery phase durations for @tbl-recovery-anatomy.
# Exports: t_detect_str, t_reschedule_str, t_reload_str, t_replay_str, t_total_str
t_detect_str = f"{R.t_detect}" t_detect_str = f"{R.t_detect}"
t_reschedule_str = f"{R.t_reschedule}" t_reschedule_str = f"{R.t_reschedule}"
t_reload_str = fmt(R.t_reload_s, precision=1) t_reload_str = fmt(R.t_reload_s, precision=1)
t_replay_str = fmt(R.t_replay_s / SECONDS_PER_MINUTE, precision=1) t_replay_str = fmt(R.t_replay_s.m_as(ureg.minute), precision=1)
t_total_str = fmt(R.t_recovery_total_s / SECONDS_PER_MINUTE, precision=1) t_total_str = fmt(R.t_recovery_total_s, precision=1)
``` ```
+----------------------------+---------------------------+-------------------------------------------------+ +----------------------------+---------------------------+-------------------------------------------------+
@@ -567,6 +619,8 @@ where $A$ is the availability of a single replica and $k$ is the number of repli
```{python} ```{python}
#| label: availability-stacking-table #| label: availability-stacking-table
#| echo: false #| echo: false
# Goal: Format availability, nines count, and annual downtime for @tbl-availability-stacking.
# Exports: avail_data list of dicts with "k", "avail", "nines", "downtime" keys
avail_data = [] avail_data = []
for k in R.avail_replicas: for k in R.avail_replicas:

View File

@@ -27,7 +27,8 @@ from mlsys.constants import (
CLOUD_EGRESS_PER_GB, USD, CLOUD_EGRESS_PER_GB, USD,
STORAGE_COST_S3_STD, STORAGE_COST_GLACIER, STORAGE_COST_S3_STD, STORAGE_COST_GLACIER,
STORAGE_COST_NVME_LOW, STORAGE_COST_NVME_HIGH, STORAGE_COST_NVME_LOW, STORAGE_COST_NVME_HIGH,
Mparam, Bparam, TFLOPs, GFLOPs Mparam, Bparam, TFLOPs, GFLOPs,
watt
) )
from mlsys.formatting import fmt, sci, check from mlsys.formatting import fmt, sci, check
@@ -77,13 +78,25 @@ Accelerators can compute faster than storage can feed them. A modern GPU process
# ┌───────────────────────────────────────────────────────────────────────────── # ┌─────────────────────────────────────────────────────────────────────────────
# │ STORAGE HIERARCHY AND MODEL SPECIFICATIONS # │ STORAGE HIERARCHY AND MODEL SPECIFICATIONS
# ├───────────────────────────────────────────────────────────────────────────── # ├─────────────────────────────────────────────────────────────────────────────
# │ Context: Used across the chapter for hierarchy tables and bottleneck analysis. # │ Context: @sec-data-storage storage hierarchy tables and I/O bottleneck
# │ analysis paragraphs throughout the chapter.
# │ # │
# │ Goal: Provide quantitative specs for hardware and lighthouse models. # │ Goal: Establish the six-tier storage hierarchy gap by computing H100 HBM
# │ Show: The massive gap between HBM bandwidth and disk I/O. # │ bandwidth (H100_MEM_BW) vs NVMe sequential bandwidth (NVME_SEQUENTIAL_BW),
# │ and estimate GPT-3 checkpoint write time (GPT3_PARAMS, FP16, at NVMe
# │ vs network storage) to show the I/O bottleneck in fault tolerance.
# │ Show: "3.35" TB/s H100 HBM vs "~7" GB/s NVMe — inline in the storage
# │ hierarchy tier comparison and checkpoint I/O bottleneck paragraphs.
# │ How: Direct .m_as() for each unit conversion; H100_TDP .m_as(watt).
# │ # │
# │ Imports: mlsys.constants # │ Imports: mlsys.constants (A100_MEM_CAPACITY, H100_MEM_CAPACITY, H100_MEM_BW,
# │ Exports: a100_mem, h100_bw_tbs, gpt3_params_b, resnet_params_m, etc. # │ H100_FLOPS_FP8_TENSOR, H100_FLOPS_FP16_TENSOR, H100_TDP,
# │ GPT3_PARAMS, RESNET50_PARAMS, NVME_SEQUENTIAL_BW,
# │ NVLINK_H100_BW, PCIE_GEN5_BW, GiB, TB, TFLOPs, GB, second,
# │ watt, Bparam, Mparam)
# │ Exports: a100_mem, h100_mem, h100_bw_tbs, h100_fp8_tflops, h100_fp16_tflops,
# │ h100_tdp_w, gpt3_params_b, resnet_params_m, nvme_bw,
# │ nvlink_bw_gbs, pcie5_bw_gbs
# └───────────────────────────────────────────────────────────────────────────── # └─────────────────────────────────────────────────────────────────────────────
import math import math
@@ -93,21 +106,21 @@ class StorageSetup:
Namespace for global storage constants and specs. Namespace for global storage constants and specs.
""" """
# GPU specs # GPU specs
a100_mem = A100_MEM_CAPACITY.to(GiB).magnitude a100_mem = A100_MEM_CAPACITY.m_as(GiB)
h100_mem = H100_MEM_CAPACITY.to(GiB).magnitude h100_mem = H100_MEM_CAPACITY.m_as(GiB)
h100_bw = H100_MEM_BW.to(TB/second).magnitude h100_bw = H100_MEM_BW.m_as(TB/second)
h100_fp8 = H100_FLOPS_FP8_TENSOR.to(TFLOPs/second).magnitude h100_fp8 = H100_FLOPS_FP8_TENSOR.m_as(TFLOPs/second)
h100_fp16 = H100_FLOPS_FP16_TENSOR.to(TFLOPs/second).magnitude h100_fp16 = H100_FLOPS_FP16_TENSOR.m_as(TFLOPs/second)
h100_tdp = H100_TDP.magnitude h100_tdp = H100_TDP.m_as(watt)
# Model specs # Model specs
gpt3_params = GPT3_PARAMS.to(Bparam).magnitude gpt3_params = GPT3_PARAMS.m_as(Bparam)
resnet_params = RESNET50_PARAMS.to(Mparam).magnitude resnet_params = RESNET50_PARAMS.m_as(Mparam)
# Storage & Interconnect # Storage & Interconnect
nvme_bw = NVME_SEQUENTIAL_BW.to(GB/second).magnitude nvme_bw = NVME_SEQUENTIAL_BW.m_as(GB/second)
nvlink_bw = NVLINK_H100_BW.to(GB/second).magnitude nvlink_bw = NVLINK_H100_BW.m_as(GB/second)
pcie5_bw = PCIE_GEN5_BW.to(GB/second).magnitude pcie5_bw = PCIE_GEN5_BW.m_as(GB/second)
# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── # ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
a100_mem = f"{StorageSetup.a100_mem:.0f}" a100_mem = f"{StorageSetup.a100_mem:.0f}"
@@ -125,11 +138,11 @@ nvlink_bw_gbs = f"{StorageSetup.nvlink_bw:.0f}"
pcie5_bw_gbs = f"{StorageSetup.pcie5_bw:.0f}" pcie5_bw_gbs = f"{StorageSetup.pcie5_bw:.0f}"
# Storage # Storage
nvme_bw = f"{NVME_SEQUENTIAL_BW.to(GB/second).magnitude:.1f}" nvme_bw = f"{NVME_SEQUENTIAL_BW.m_as(GB/second):.1f}"
# Interconnect # Interconnect
nvlink_bw_gbs = f"{NVLINK_H100_BW.to(GB/second).magnitude:.0f}" nvlink_bw_gbs = f"{NVLINK_H100_BW.m_as(GB/second):.0f}"
pcie5_bw_gbs = f"{PCIE_GEN5_BW.to(GB/second).magnitude:.0f}" pcie5_bw_gbs = f"{PCIE_GEN5_BW.m_as(GB/second):.0f}"
# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── # ┌── P.I.C.O. ISOLATED SCENARIO ───────────────────────────────────────────────
class StorageEconomics: class StorageEconomics:

View File

@@ -40,25 +40,66 @@ A single GPU fails perhaps once per year. A thousand GPUs experience failures da
::: :::
```{python} ```{python}
#| label: fault-tolerance-setup
#| echo: false #| echo: false
#| label: fault-tolerance-setup
# ┌─────────────────────────────────────────────────────────────────────────────
# │ FAULT TOLERANCE CHAPTER SETUP
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: Chapter-wide registry — values used in §Young-Daly Law
# │ (@eq-young-daly-applied, line ~1957), §Sharded Checkpointing (line ~2289),
# │ and §Recovery Cost (line ~2365).
# │
# │ Goal: Pre-compute GPT-3 checkpoint size (weights + Adam states) and
# │ per-worker shard size for 1000-worker training, motivating the
# │ checkpoint-interval formula and distributed checkpoint design.
# │ Show: gpt3_ckpt_tb="2.1" TB (full checkpoint),
# │ gpt3_shard_gb="2.1" GB (per-worker shard at 1000 workers) — inline in prose.
# │ How: Multiply GPT3_PARAMS.m_as(param) by bytes-per-param for each state;
# │ convert result pint Quantity with .m_as(TB) and .m_as(GB).
# │
# │ Imports: mlsys.constants (GPT3_PARAMS, param, byte, TB, GB, BILLION),
# │ mlsys.formatting (fmt, sci)
# │ Exports: gpt3_params_b, gpt3_ckpt_tb, gpt3_adam_tb, gpt3_shard_gb
# │ Note: PERSISTENT — gpt3_ckpt_tb used in §Young-Daly (line ~1957),
# │ §Sharded Checkpointing (line ~2289), §Recovery (line ~2365, ~2385);
# │ gpt3_shard_gb used in §Sharded Checkpointing (line ~2289), §Recovery (~2371, ~2385).
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import * from mlsys.constants import *
from mlsys.formatting import fmt, sci from mlsys.formatting import fmt, sci
# GPT-3 model parameters # ┌── P.I.C.O. ISOLATED SCENARIO ───────────────────────────────────────────────
gpt3_params_b = f"{GPT3_PARAMS.to(param).magnitude / BILLION:.0f}" class FaultToleranceSetup:
"""Namespace for GPT-3 checkpoint sizing and shard calculations."""
# GPT-3 checkpoint size: weights (2 bytes FP16) + Adam m (4 bytes) + Adam v (4 bytes) = 12 bytes/param # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
gpt3_ckpt_bytes = GPT3_PARAMS.magnitude * 12 * byte # GPT-3 checkpoint byte layout:
gpt3_ckpt_tb = f"{gpt3_ckpt_bytes.to(TB).magnitude:.1f}" # weights (2 bytes FP16) + Adam m (4 bytes) + Adam v (4 bytes) = 12 bytes/param
bytes_full_ckpt = 12 # bytes per param: weights + Adam m + v
bytes_adam_only = 8 # bytes per param: Adam m + v only
n_workers = 1000 # workers for shard size calculation
# GPT-3 Adam optimizer state: m + v = 8 bytes/param # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
gpt3_adam_bytes = GPT3_PARAMS.magnitude * 8 * byte # Full checkpoint: weights + optimizer states
gpt3_adam_tb = f"{gpt3_adam_bytes.to(TB).magnitude:.1f}" gpt3_ckpt_bytes = GPT3_PARAMS.m_as(param) * bytes_full_ckpt * byte
# Per-worker shard for 1000 workers # Optimizer-only checkpoint: Adam m + v (no weights)
gpt3_shard_gb = f"{gpt3_ckpt_bytes.to(GB).magnitude / 1000:.1f}" gpt3_adam_bytes = GPT3_PARAMS.m_as(param) * bytes_adam_only * byte
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
# No check() calls needed — values are monotone functions of constants.
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
gpt3_params_b = f"{GPT3_PARAMS.m_as(param) / BILLION:.0f}"
gpt3_ckpt_tb = f"{gpt3_ckpt_bytes.m_as(TB):.1f}"
gpt3_adam_tb = f"{gpt3_adam_bytes.m_as(TB):.1f}"
gpt3_shard_gb = f"{gpt3_ckpt_bytes.m_as(GB) / n_workers:.1f}"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
gpt3_params_b = FaultToleranceSetup.gpt3_params_b
gpt3_ckpt_tb = FaultToleranceSetup.gpt3_ckpt_tb
gpt3_adam_tb = FaultToleranceSetup.gpt3_adam_tb
gpt3_shard_gb = FaultToleranceSetup.gpt3_shard_gb
``` ```
## Failure Analysis at Scale {#sec-fault-tolerance-reliability-reliability-failure-analysis-scale-6b4b} ## Failure Analysis at Scale {#sec-fault-tolerance-reliability-reliability-failure-analysis-scale-6b4b}
@@ -2123,45 +2164,88 @@ Imagine 10,000 GPUs, each holding a 10 GB shard of the model state, simultaneous
While @tbl-checkpoint-overhead-by-model suggests modest overhead percentages, real deployments often encounter checkpoint times far exceeding these theoretical estimates. Diagnosing such discrepancies requires examining the full system stack. While @tbl-checkpoint-overhead-by-model suggests modest overhead percentages, real deployments often encounter checkpoint times far exceeding these theoretical estimates. Diagnosing such discrepancies requires examining the full system stack.
```{python} ```{python}
#| label: checkpoint-debug-calc
#| echo: false #| echo: false
#| label: checkpoint-debug-calc
# ┌─────────────────────────────────────────────────────────────────────────────
# │ CHECKPOINT DEBUG CALCULATION
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: "Debugging Checkpoint Overhead" callout in §Checkpoint Overhead.
# │
# │ Goal: Diagnose why a 70B model checkpoint takes 10 minutes instead of
# │ 2 minutes on an NFS-backed cluster, by computing theoretical bandwidth
# │ limits and contention-induced effective throughput per node.
# │ Show: total_ckpt_gb_str="420" GB, nfs_gbs_str="1.25" GB/s,
# │ min_write_min_str="5.6" min, per_node_mbs_str="20" MB/s,
# │ serialized_min_str="5,600" min — inline in the Fleet Stack diagnosis.
# │ How: Compute weights + optimizer state size in GB; derive NFS bandwidth in
# │ GB/s (10 Gbps / 8); calculate min write time and per-node bandwidth
# │ under contention from 64 concurrent nodes.
# │
# │ Imports: (none — pure Python arithmetic, no pint quantities)
# │ Exports: weights_gb_str, optimizer_gb_str, total_ckpt_gb_str, nfs_gbs_str,
# │ min_write_s_str, min_write_min_str, per_node_mbs_str, serialized_min_str,
# │ extended_weeks_str, extra_cost_k_str
# └─────────────────────────────────────────────────────────────────────────────
# 70B model checkpoint sizing class CheckpointDebugCalc:
model_params_b = 70 # billions """Diagnose 70B checkpoint overhead on NFS-backed cluster."""
bytes_per_param = 2 # BF16
weights_gb = model_params_b * bytes_per_param # 140 GB
optimizer_gb = weights_gb * 2 # Adam first + second moments
total_ckpt_gb = weights_gb + optimizer_gb # 420 GB
# Storage constraints # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
nfs_gbps = 10 # Gbps network model_params_b = 70 # 70B parameter model
nfs_gbs = nfs_gbps / 8 # 1.25 GB/s bytes_per_param = 2 # BF16 weights
min_write_s = total_ckpt_gb / nfs_gbs # seconds nfs_gbps = 10 # NFS network attachment bandwidth in Gbps
min_write_min = min_write_s / 60 # minutes n_nodes = 64 # nodes writing simultaneously
overhead_pct = 30 # observed training throughput loss %
base_weeks = 2 # baseline training duration (weeks)
extra_cost_k = 500 # additional cost from extended training ($K)
# Contention analysis # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
n_nodes = 64 # Model state sizing
per_node_gbs = nfs_gbs / n_nodes # GB/s per node weights_gb = model_params_b * bytes_per_param # 140 GB
per_node_mbs = per_node_gbs * 1000 # MB/s per node optimizer_gb = weights_gb * 2 # Adam m + v moments
serialized_min = (total_ckpt_gb / per_node_gbs) / 60 total_ckpt_gb = weights_gb + optimizer_gb # 420 GB
# Training extension # Storage bandwidth limits
overhead_pct = 30 nfs_gbs = nfs_gbps / 8 # 1.25 GB/s
base_weeks = 2 min_write_s = total_ckpt_gb / nfs_gbs # theoretical minimum seconds
extended_weeks = base_weeks * (1 + overhead_pct / 100) min_write_min = min_write_s / 60 # convert to minutes
extra_cost_k = 500 # $K
# Format strings # Contention: 64 nodes sharing the NFS bandwidth
weights_gb_str = f"{weights_gb:.0f}" per_node_gbs = nfs_gbs / n_nodes # GB/s per node under contention
optimizer_gb_str = f"{optimizer_gb:.0f}" per_node_mbs = per_node_gbs * 1000 # MB/s per node
total_ckpt_gb_str = f"{total_ckpt_gb:.0f}" serialized_min = (total_ckpt_gb / per_node_gbs) / 60 # worst-case serialized write time
nfs_gbs_str = f"{nfs_gbs}"
min_write_s_str = f"{min_write_s:.0f}" # Training schedule impact
min_write_min_str = f"{min_write_min:.1f}" extended_weeks = base_weeks * (1 + overhead_pct / 100)
per_node_mbs_str = f"{per_node_mbs:.0f}"
serialized_min_str = f"{serialized_min:.0f}" # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
extended_weeks_str = f"{extended_weeks:.1f}" assert min_write_min < 10, "Theoretical minimum must be less than observed 10 minutes"
extra_cost_k_str = f"{extra_cost_k}" assert serialized_min > min_write_min, "Contention time must exceed theoretical minimum"
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
weights_gb_str = f"{weights_gb:.0f}"
optimizer_gb_str = f"{optimizer_gb:.0f}"
total_ckpt_gb_str = f"{total_ckpt_gb:.0f}"
nfs_gbs_str = f"{nfs_gbs}"
min_write_s_str = f"{min_write_s:.0f}"
min_write_min_str = f"{min_write_min:.1f}"
per_node_mbs_str = f"{per_node_mbs:.0f}"
serialized_min_str = f"{serialized_min:.0f}"
extended_weeks_str = f"{extended_weeks:.1f}"
extra_cost_k_str = f"{extra_cost_k}"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
weights_gb_str = CheckpointDebugCalc.weights_gb_str
optimizer_gb_str = CheckpointDebugCalc.optimizer_gb_str
total_ckpt_gb_str = CheckpointDebugCalc.total_ckpt_gb_str
nfs_gbs_str = CheckpointDebugCalc.nfs_gbs_str
min_write_s_str = CheckpointDebugCalc.min_write_s_str
min_write_min_str = CheckpointDebugCalc.min_write_min_str
per_node_gbs = CheckpointDebugCalc.per_node_gbs
per_node_mbs_str = CheckpointDebugCalc.per_node_mbs_str
serialized_min_str = CheckpointDebugCalc.serialized_min_str
extended_weeks_str = CheckpointDebugCalc.extended_weeks_str
extra_cost_k_str = CheckpointDebugCalc.extra_cost_k_str
``` ```
::: {.callout-example title="Debugging Checkpoint Overhead"} ::: {.callout-example title="Debugging Checkpoint Overhead"}