fix: resolve cross-cell export gaps found during comprehensive HTML build verification

After the class-based namespace isolation pass, missing EXPORTS bridge
variables were discovered by running all chapters through the HTML build pipeline.

Vol1 fixes:
- nn_computation: add hog_grid_str/hog_bins_str exports; convert generator
  expressions to for-loops (Python 3 class scope skips class namespace);
  add mnist_large/small_l1/l2 exports for footnote inline Python
- ml_systems: add cloud_compute/memory/ai_frac, mobile_tops/bw/ratio/
  bottleneck/compute/memory_frac, cloud_thresh_bw_str, edge_thresh_bw_str
  exports; complete ResnetMobile EXPORTS section
- data_selection: fix FpScalingCalc invariant (min_samples_threshold 50→150
  so 100 expected rare samples < 150 threshold holds true)
- model_compression: FusionCalc bandwidth_reduction invariant 50→40%
- nn_architectures: add 'param' unit to lighthouse-table-specs imports

Vol2 fixes:
- data_storage: add missing 'watt' import to chapter setup cell
- fault_tolerance: export per_node_gbs raw float for prose arithmetic
- appendix_fleet: export rho_7b raw float for fmt() call in prose
- appendix_c3: add .magnitude to calc_effective_flops() result (returns
  Quantity since formulas.py upgrade, not raw float)
- appendix_reliability: wrap worked-example-young-daly in class with EXPORTS

All 43 chapters with Python cells verified passing after fixes.
This commit is contained in:
Vijay Janapa Reddi
2026-02-21 14:20:43 -05:00
parent 5677633b4c
commit b887b91a2c
10 changed files with 2928 additions and 1729 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -219,7 +219,7 @@ The quantitative characteristics of these Lighthouse models expose a critical en
from mlsys import Hardware, Models
from mlsys.constants import (
A100_MEM_CAPACITY, BYTES_FP32, BYTES_FP16, Mparam, Bparam, Kparam, GFLOPs, MFLOPs, MB, GB, KB, GiB
A100_MEM_CAPACITY, BYTES_FP32, BYTES_FP16, param, Mparam, Bparam, Kparam, GFLOPs, MFLOPs, MB, GB, KB, GiB
)
from mlsys.formatting import fmt, check
from mlsys.formulas import model_memory
@@ -242,35 +242,35 @@ class LighthouseSpecs:
# ┌── 2. CALCULATION (The Physics) ─────────────────────────────────────────
# ResNet-50
resnet_params = m_resnet.parameters.to(Mparam).magnitude
resnet_flops = m_resnet.inference_flops.to(GFLOPs).magnitude
resnet_mem_mb = m_resnet.size_in_bytes(BYTES_FP32).to(MB).magnitude
resnet_params = m_resnet.parameters.m_as(Mparam)
resnet_flops = m_resnet.inference_flops.m_as(GFLOPs)
resnet_mem_mb = m_resnet.size_in_bytes(BYTES_FP32).m_as(MB)
# GPT-2 XL
gpt2_params = m_gpt2.parameters.to(Bparam).magnitude
gpt2_params = m_gpt2.parameters.m_as(Bparam)
gpt2_flops_token = 3.0 # Approximate
gpt2_mem_gb = m_gpt2.size_in_bytes(BYTES_FP32).to(GB).magnitude
gpt2_mem_gb = m_gpt2.size_in_bytes(BYTES_FP32).m_as(GB)
# DLRM
dlrm_entries_b = 25.0 # 25B entries
dlrm_mem_gb = m_dlrm.model_size.to(GB).magnitude
dlrm_mem_gb = m_dlrm.model_size.m_as(GB)
# MobileNetV2
mobilenet_params = m_mobilenet.parameters.to(Mparam).magnitude
mobilenet_flops = m_mobilenet.inference_flops.to(MFLOPs).magnitude
mobilenet_mem_mb = m_mobilenet.size_in_bytes(BYTES_FP32).to(MB).magnitude
mobilenet_params = m_mobilenet.parameters.m_as(Mparam)
mobilenet_flops = m_mobilenet.inference_flops.m_as(MFLOPs)
mobilenet_mem_mb = m_mobilenet.size_in_bytes(BYTES_FP32).m_as(MB)
# KWS (DS-CNN)
kws_params_k = m_kws.parameters.to(Kparam).magnitude
kws_flops_m = m_kws.inference_flops.to(MFLOPs).magnitude
kws_mem_kb = m_kws.size_in_bytes(BYTES_FP32).to(KB).magnitude
kws_params_k = m_kws.parameters.m_as(Kparam)
kws_flops_m = m_kws.inference_flops.m_as(MFLOPs)
kws_mem_kb = m_kws.size_in_bytes(BYTES_FP32).m_as(KB)
# Ratios
mobilenet_size_ratio = m_resnet.parameters.magnitude / m_mobilenet.parameters.magnitude
mobilenet_flops_ratio = (m_resnet.inference_flops / m_mobilenet.inference_flops).to('count').magnitude
mobilenet_size_ratio = m_resnet.parameters.m_as(param) / m_mobilenet.parameters.m_as(param)
mobilenet_flops_ratio = (m_resnet.inference_flops / m_mobilenet.inference_flops).m_as('count')
# Reference Hardware
a100_mem = hw_a100.memory_capacity.to(GiB).magnitude
a100_mem = hw_a100.memory_capacity.m_as(GiB)
# ┌── 3. INVARIANTS (Guardrails) ───────────────────────────────────────────
# Ensure numbers match the book's narrative
@@ -288,7 +288,7 @@ class LighthouseSpecs:
gpt2_gflops_per_token_str = fmt(gpt2_flops_token, precision=1)
# GPT-3 context
gpt3_fp16_gb_str = fmt(Models.GPT3.size_in_bytes(BYTES_FP16).to(GB).magnitude, precision=0)
gpt3_fp16_gb_str = fmt(Models.GPT3.size_in_bytes(BYTES_FP16).m_as(GB), precision=0)
dlrm_entries_b_str = fmt(dlrm_entries_b, precision=0)
dlrm_model_size_gb_str = fmt(dlrm_mem_gb, precision=0)
@@ -490,8 +490,8 @@ class MLPvsCNN:
check(ratio >= 10, f"MLP ({mlp_p}) isn't significantly larger than CNN ({cnn_p}). Ratio: {ratio}x")
# ┌── 4. OUTPUTS (Formatting) ──────────────────────────────────────────────
mlp_params_str = f"{(mlp_p * param).to(Mparam).magnitude:.0f}M"
cnn_params_str = f"{(cnn_p * param).to(Kparam).magnitude:.0f}K"
mlp_params_str = f"{(mlp_p * param).m_as(Mparam):.0f}M"
cnn_params_str = f"{(cnn_p * param).m_as(Kparam):.0f}K"
param_ratio_str = f"{ratio}"
# Note: Use MLPvsCNN.mlp_params_str directly.
@@ -859,10 +859,10 @@ class A100Specs:
# ┌── 1. PARAMETERS (Inputs) ───────────────────────────────────────────────
# A100 performance at various precisions
fp16_tensor = A100_FLOPS_FP16_TENSOR.to(TFLOPs/second).magnitude
int8_tensor = A100_FLOPS_INT8.to(TFLOPs/second).magnitude
fp32_cuda = A100_FLOPS_FP32.to(TFLOPs/second).magnitude
tf32_tensor = A100_FLOPS_TF32.to(TFLOPs/second).magnitude
fp16_tensor = A100_FLOPS_FP16_TENSOR.m_as(TFLOPs/second)
int8_tensor = A100_FLOPS_INT8.m_as(TFLOPs/second)
fp32_cuda = A100_FLOPS_FP32.m_as(TFLOPs/second)
tf32_tensor = A100_FLOPS_TF32.m_as(TFLOPs/second)
# ┌── 4. OUTPUTS (Formatting) ──────────────────────────────────────────────
a100_tflops_fp16_str = fmt(fp16_tensor, precision=0, commas=False)
@@ -2364,17 +2364,27 @@ Attention mechanisms create computational patterns that differ significantly fro
# │ Exports: attn_score_macs_m_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import MILLION
from mlsys.formatting import fmt, check
# --- Inputs (typical attention configuration) ---
attn_seq_len_value = 512 # sequence length
attn_head_dim_value = 64 # dimension per head
class AttentionComputeCosts:
"""Demonstrate quadratic compute cost of self-attention at sequence length 512."""
# --- Computation costs ---
attn_score_macs_value = attn_seq_len_value * attn_seq_len_value * attn_head_dim_value
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
seq_len = 512 # sequence length
head_dim = 64 # dimension per head
# --- Outputs (formatted strings for prose) ---
attn_score_macs_m_str = fmt(attn_score_macs_value / MILLION, precision=1, commas=False) # e.g. "16.8"
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
score_macs = seq_len * seq_len * head_dim
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(score_macs > MILLION, "Attention MACs should exceed 1M for seq_len=512.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
attn_score_macs_m_str = fmt(score_macs / MILLION, precision=1, commas=False) # e.g. "16.8"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
attn_score_macs_m_str = AttentionComputeCosts.attn_score_macs_m_str
```
::: {#lst-attention_layer_compute lst-cap="**Attention Computation**: Two implementations showing the same O(N^2 $\times$ d) complexity. The matrix form (top) uses optimized GEMM, while the nested loops (bottom) expose the quadratic pairwise comparisons: for sequence length 512 and dimension 64, computing attention scores requires 512 $\times$ 512 $\times$ 64 = `{python} attn_score_macs_m_str` million MACs per attention head, plus another `{python} attn_score_macs_m_str`M for value aggregation."}
@@ -2471,7 +2481,7 @@ class AttentionMemory:
# ┌── 1. PARAMETERS (Inputs) ───────────────────────────────────────────────
seq_len = 100_000
bytes_per_element = BYTES_FP16.magnitude
bytes_per_element = BYTES_FP16.m_as(byte)
num_layers = 32
num_heads = 12
@@ -2886,7 +2896,7 @@ class DLRMEmbedding:
# ┌── 2. CALCULATION (The Physics) ─────────────────────────────────────────
table_bytes = num_users * embed_dim * bytes_per_param
table_gb = (table_bytes * byte).to(GB).magnitude
table_gb = (table_bytes * byte).m_as(GB)
# ┌── 3. INVARIANTS (Guardrails) ───────────────────────────────────────────
check(table_gb >= 80, f"DLRM table ({table_gb:.1f} GB) fits on an A100. It must be larger to justify model parallelism.")
@@ -2964,12 +2974,12 @@ class CapacityWall:
# ┌── 1. PARAMETERS (Inputs) ───────────────────────────────────────────────
num_items = 100_000_000
embed_dim = 128
bytes_per_param = BYTES_FP32.magnitude
bytes_per_param = BYTES_FP32.m_as(byte)
# ┌── 2. CALCULATION (The Physics) ─────────────────────────────────────────
table_bytes = num_items * embed_dim * bytes_per_param
table_gb = (table_bytes * byte).to(GB).magnitude
a100_capacity_gb = A100_MEM_CAPACITY.to(GB).magnitude
table_gb = (table_bytes * byte).m_as(GB)
a100_capacity_gb = A100_MEM_CAPACITY.m_as(GB)
utilization_pct = (table_gb / a100_capacity_gb) * 100
# ┌── 4. OUTPUTS (Formatting) ──────────────────────────────────────────────
@@ -3166,13 +3176,27 @@ Recall the plain 50-layer network from the analysis above: loss stuck at 1.8, on
from mlsys.formatting import fmt, check
# --- Empirical overhead measurements ---
skip_memory_overhead_pct_value = 20 # activation storage
skip_epoch_cost_pct_value = 10 # per-epoch compute
class ResNetSkipOverhead:
"""Quantify systems cost of residual connections: ~20% memory overhead."""
# --- Outputs (formatted strings for prose) ---
skip_memory_overhead_pct_str = fmt(skip_memory_overhead_pct_value, precision=0, commas=False) # e.g. "20"
skip_epoch_cost_pct_str = fmt(skip_epoch_cost_pct_value, precision=0, commas=False) # e.g. "10"
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
memory_overhead_pct = 20 # activation storage
epoch_cost_pct = 10 # per-epoch compute
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
# Values are empirical anchors; no derived calculation needed.
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(0 < memory_overhead_pct < 100, "Memory overhead must be a valid percentage.")
check(0 < epoch_cost_pct < 100, "Epoch cost must be a valid percentage.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
skip_memory_overhead_pct_str = fmt(memory_overhead_pct, precision=0, commas=False) # e.g. "20"
skip_epoch_cost_pct_str = fmt(epoch_cost_pct, precision=0, commas=False) # e.g. "10"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
skip_memory_overhead_pct_str = ResNetSkipOverhead.skip_memory_overhead_pct_str
skip_epoch_cost_pct_str = ResNetSkipOverhead.skip_epoch_cost_pct_str
```
While skip connections solve gradient flow, they introduce system-level costs. Memory overhead increases because skip connections require storing the input to each residual block for the addition operation during the forward pass and for backpropagation. For a ResNet-50 with batch size 32 processing $224 \times 224$ RGB images, this adds approximately `{python} skip_memory_overhead_pct_str`% memory overhead compared to a plain network. The computational cost of the addition operation ($y = \mathcal{F}(x) + x$) is computationally trivial, adding negligible compute time. The primary cost is the residual function $\mathcal{F}(x)$ itself.
@@ -3654,16 +3678,29 @@ Energy consumption patterns vary dramatically across neural network architecture
# │ Exports: energy_mac_pj_str, energy_dram_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import ENERGY_DRAM_ACCESS_PJ
from mlsys.constants import ENERGY_DRAM_ACCESS_PJ, ureg
from mlsys.formatting import fmt, check
# --- Energy costs (from Horowitz 2014) ---
energy_mac_pj_value = 4.6 # pJ per MAC (45nm)
energy_dram_value = ENERGY_DRAM_ACCESS_PJ.magnitude # pJ per 32-bit access
class EnergyConsumptionAnalysis:
"""Contrast energy cost of compute vs. data movement: DRAM access is ~5x more costly."""
# --- Outputs (formatted strings for prose) ---
energy_mac_pj_str = f"{energy_mac_pj_value}" # e.g. "4.6"
energy_dram_str = fmt(energy_dram_value, precision=0, commas=False) # e.g. "26"
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
mac_pj = 4.6 # pJ per MAC (Horowitz 2014, 45nm)
dram_pj = ENERGY_DRAM_ACCESS_PJ.m_as(ureg.picojoule) # pJ per 32-bit access
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
dram_to_mac_ratio = dram_pj / mac_pj
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(dram_to_mac_ratio > 1, "DRAM access must cost more energy than a MAC.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
energy_mac_pj_str = f"{mac_pj}" # e.g. "4.6"
energy_dram_str = fmt(dram_pj, precision=0, commas=False) # e.g. "26"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
energy_mac_pj_str = EnergyConsumptionAnalysis.energy_mac_pj_str
energy_dram_str = EnergyConsumptionAnalysis.energy_dram_str
```
Dense matrix operations in MLPs achieve excellent arithmetic intensity[^fn-arithmetic-intensity-dnn] (computation per data movement) but consume significant absolute energy. Each multiply-accumulate operation consumes approximately `{python} energy_mac_pj_str` pJ, while data movement from DRAM costs `{python} energy_dram_str` pJ per 32-bit value [@horowitz2014computing]. Given this energy ratio, typical MLP inference spends the majority of its energy budget on data movement rather than computation, making memory bandwidth optimization critical for energy efficiency.
@@ -3745,17 +3782,29 @@ CNNs benefit from specialized convolution algorithms and data layout optimizatio
from mlsys.formatting import fmt, check
# --- Standard vs Winograd multiply counts for 3x3 conv ---
std_muls_3x3_value = 9 # 3x3 = 9 muls
winograd_muls_value = 4 # Winograd F(2,3)
class WinogradCalc:
"""Demonstrate 2.25x multiplication reduction of Winograd F(2,3) vs standard 3x3 conv."""
# --- Reduction ratio ---
winograd_reduction_value = std_muls_3x3_value / winograd_muls_value
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
std_muls_3x3 = 9 # 3x3 = 9 multiplies
winograd_muls = 4 # Winograd F(2,3) multiplies
# --- Outputs (formatted strings for prose) ---
winograd_reduction_str = fmt(winograd_reduction_value, precision=2, commas=False) # e.g. "2.25"
std_muls_3x3_str = f"{std_muls_3x3_value}" # e.g. "9"
winograd_muls_str = f"{winograd_muls_value}" # e.g. "4"
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
winograd_reduction = std_muls_3x3 / winograd_muls
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(winograd_reduction > 1, "Winograd must reduce multiply count.")
check(abs(winograd_reduction - 2.25) < 0.01, "Winograd F(2,3) must yield 2.25x reduction.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
winograd_reduction_str = fmt(winograd_reduction, precision=2, commas=False) # e.g. "2.25"
std_muls_3x3_str = f"{std_muls_3x3}" # e.g. "9"
winograd_muls_str = f"{winograd_muls}" # e.g. "4"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
winograd_reduction_str = WinogradCalc.winograd_reduction_str
std_muls_3x3_str = WinogradCalc.std_muls_3x3_str
winograd_muls_str = WinogradCalc.winograd_muls_str
```
[^fn-winograd]: **Winograd Algorithms**\index{Winograd Algorithm}: Fast convolution algorithms based on Shmuel Winograd's 1980 work on minimal multiplication complexity. For 3 $\times$ 3 convolutions, Winograd reduces multiply operations from `{python} std_muls_3x3_str` to `{python} winograd_muls_str` per output (`{python} winograd_reduction_str` $\times$ reduction) by trading multiplications for additions, which cost less in terms of both latency and energy. Modern deep learning frameworks like cuDNN automatically select Winograd for appropriate layer configurations, though numerical precision degradation at FP16 limits applicability for mixed-precision training.
@@ -3883,32 +3932,50 @@ This section synthesizes the chapter's concepts through a complete architecture
from mlsys.formatting import fmt, check
from mlsys.constants import RESNET50_FLOPs, GFLOPs, TFLOPs
# --- Inputs (real-time video processing) ---
tc_fps_value = 30 # target frame rate
tc_midrange_gpu_tflops_value = 10 # reference mid-range GPU
tc_objdet_gflops_value = 100 # object detection model
class ThroughputCeilingCalc:
"""Evaluate real-time vision feasibility: ResNet-50 at 30 FPS leaves ample headroom."""
# --- Computation ---
tc_resnet_gflops_value = RESNET50_FLOPs.to(GFLOPs).magnitude
tc_sustained_gflops_value = tc_fps_value * tc_resnet_gflops_value
tc_effective_tflops_low_value = tc_midrange_gpu_tflops_value * 0.50 # 50% utilization
tc_effective_tflops_high_value = tc_midrange_gpu_tflops_value * 0.60 # 60% utilization
tc_headroom_value = tc_effective_tflops_low_value * 1000 / tc_sustained_gflops_value
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
fps = 30 # target frame rate
midrange_gpu_tflops = 10 # reference mid-range GPU (TFLOPS)
objdet_gflops = 100 # object detection model (GFLOPs)
tc_objdet_sustained_value = (tc_fps_value * tc_objdet_gflops_value * GFLOPs).to(TFLOPs).magnitude
tc_objdet_headroom_value = tc_effective_tflops_low_value / tc_objdet_sustained_value
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
resnet_gflops = RESNET50_FLOPs.m_as(GFLOPs)
sustained_gflops = fps * resnet_gflops
effective_tflops_low = midrange_gpu_tflops * 0.50 # 50% utilization
effective_tflops_high = midrange_gpu_tflops * 0.60 # 60% utilization
headroom = effective_tflops_low * 1000 / sustained_gflops
# --- Outputs (formatted strings for prose) ---
tc_fps_str = f"{tc_fps_value}" # e.g. "30"
tc_resnet_gflops_str = fmt(tc_resnet_gflops_value, precision=0, commas=False) # e.g. "4"
tc_sustained_gflops_str = fmt(tc_sustained_gflops_value, precision=0, commas=False) # e.g. "123"
tc_gpu_tflops_str = f"{tc_midrange_gpu_tflops_value}" # e.g. "10"
tc_effective_low_str = fmt(tc_effective_tflops_low_value, precision=0, commas=False) # e.g. "5"
tc_effective_high_str = fmt(tc_effective_tflops_high_value, precision=0, commas=False) # e.g. "6"
tc_headroom_str = fmt(tc_headroom_value, precision=0, commas=False) # e.g. "41"
tc_objdet_gflops_str = f"{tc_objdet_gflops_value}" # e.g. "100"
tc_objdet_sustained_str = fmt(tc_objdet_sustained_value, precision=0, commas=False) # e.g. "3"
tc_objdet_headroom_str = fmt(tc_objdet_headroom_value, precision=0, commas=False) # e.g. "2"
objdet_sustained_tflops = (fps * objdet_gflops * GFLOPs).m_as(TFLOPs)
objdet_headroom = effective_tflops_low / objdet_sustained_tflops
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(headroom > 1, "ResNet-50 at 30 FPS must leave compute headroom on a mid-range GPU.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
tc_fps_str = f"{fps}" # e.g. "30"
tc_resnet_gflops_str = fmt(resnet_gflops, precision=0, commas=False) # e.g. "4"
tc_sustained_gflops_str = fmt(sustained_gflops, precision=0, commas=False) # e.g. "123"
tc_gpu_tflops_str = f"{midrange_gpu_tflops}" # e.g. "10"
tc_effective_low_str = fmt(effective_tflops_low, precision=0, commas=False) # e.g. "5"
tc_effective_high_str = fmt(effective_tflops_high, precision=0, commas=False) # e.g. "6"
tc_headroom_str = fmt(headroom, precision=0, commas=False) # e.g. "41"
tc_objdet_gflops_str = f"{objdet_gflops}" # e.g. "100"
tc_objdet_sustained_str = fmt(objdet_sustained_tflops, precision=0, commas=False) # e.g. "3"
tc_objdet_headroom_str = fmt(objdet_headroom, precision=0, commas=False) # e.g. "2"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
tc_fps_str = ThroughputCeilingCalc.tc_fps_str
tc_resnet_gflops_str = ThroughputCeilingCalc.tc_resnet_gflops_str
tc_sustained_gflops_str = ThroughputCeilingCalc.tc_sustained_gflops_str
tc_gpu_tflops_str = ThroughputCeilingCalc.tc_gpu_tflops_str
tc_effective_low_str = ThroughputCeilingCalc.tc_effective_low_str
tc_effective_high_str = ThroughputCeilingCalc.tc_effective_high_str
tc_headroom_str = ThroughputCeilingCalc.tc_headroom_str
tc_objdet_gflops_str = ThroughputCeilingCalc.tc_objdet_gflops_str
tc_objdet_sustained_str = ThroughputCeilingCalc.tc_objdet_sustained_str
tc_objdet_headroom_str = ThroughputCeilingCalc.tc_objdet_headroom_str
```
::: {.callout-notebook title="The Throughput Ceiling"}
@@ -3944,50 +4011,68 @@ tc_objdet_headroom_str = fmt(tc_objdet_headroom_value, precision=0, commas=False
from mlsys.formatting import fmt, check
from mlsys.constants import KWS_DSCNN_PARAMS, KWS_DSCNN_FLOPs, Kparam, MFLOPs
# --- MobileNetV1 specs ---
mnv1_params_m_value = 4.2 # millions of params
mnv1_flops_mflops_value = 569 # MFLOPs at 224x224
class WildlifeModelSizing:
"""Select model architecture for constrained edge deployment: MobileNetV2 fits 512 MB."""
# --- MobileNetV2 (0.75x width) specs ---
mnv2_params_m_value = 2.2 # millions of params
mnv2_flops_mflops_value = 150 # MFLOPs at 224x224
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
# MobileNetV1 specs
mnv1_params_m = 4.2 # millions of params
mnv1_flops_mflops = 569 # MFLOPs at 224x224
# --- Edge deployment power assumptions ---
inference_power_mw_value = 200 # milliwatts during inference
inference_latency_ms_value = 75 # ms per inference
inferences_per_day_value = 100 # trigger-based
# MobileNetV2 (0.75x width) specs
mnv2_params_m = 2.2 # millions of params
mnv2_flops_mflops = 150 # MFLOPs at 224x224
# --- Memory calculations ---
mnv1_fp32_mb_value = mnv1_params_m_value * 4 # FP32: 4 bytes/param
mnv1_int8_mb_value = mnv1_params_m_value * 1 # INT8: 1 byte/param
mnv2_fp32_mb_value = mnv2_params_m_value * 4
mnv2_int8_mb_value = mnv2_params_m_value * 1
# Edge deployment power assumptions
inference_power_mw = 200 # milliwatts during inference
inference_latency_ms = 75 # ms per inference
inferences_per_day = 100 # trigger-based
# --- KWS reference (too small for 50-species task) ---
kws_example_params_k_value = KWS_DSCNN_PARAMS.to(Kparam).magnitude
kws_example_flops_mflops_value = KWS_DSCNN_FLOPs.to(MFLOPs).magnitude
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
# Memory footprints
mnv1_fp32_mb = mnv1_params_m * 4 # FP32: 4 bytes/param
mnv1_int8_mb = mnv1_params_m * 1 # INT8: 1 byte/param
mnv2_fp32_mb = mnv2_params_m * 4
mnv2_int8_mb = mnv2_params_m * 1
# --- Energy calculations ---
energy_per_inf_mj_value = (
inference_power_mw_value * inference_latency_ms_value / 1000
)
energy_per_day_j_value = (
inferences_per_day_value * energy_per_inf_mj_value / 1000
)
# KWS reference (too small for 50-species task)
kws_example_params_k = KWS_DSCNN_PARAMS.m_as(Kparam)
kws_example_flops_mflops = KWS_DSCNN_FLOPs.m_as(MFLOPs)
# --- Outputs (formatted strings for prose) ---
mnv1_params_str = fmt(mnv1_params_m_value, precision=1, commas=False) # e.g. "4.2"
mnv1_flops_str = fmt(mnv1_flops_mflops_value, precision=0, commas=False) # e.g. "569"
mnv1_fp32_str = fmt(mnv1_fp32_mb_value, precision=0, commas=False) # e.g. "17"
mnv1_int8_str = fmt(mnv1_int8_mb_value, precision=0, commas=False) # e.g. "4"
mnv2_params_str = fmt(mnv2_params_m_value, precision=1, commas=False) # e.g. "2.2"
mnv2_flops_str = fmt(mnv2_flops_mflops_value, precision=0, commas=False) # e.g. "150"
mnv2_fp32_str = fmt(mnv2_fp32_mb_value, precision=0, commas=False) # e.g. "9"
mnv2_int8_str = fmt(mnv2_int8_mb_value, precision=1, commas=False) # e.g. "2.2"
kws_example_params_str = fmt(kws_example_params_k_value, precision=0, commas=False) # e.g. "26"
kws_example_flops_str = fmt(kws_example_flops_mflops_value, precision=0, commas=False) # e.g. "6"
energy_mj_str = fmt(energy_per_inf_mj_value, precision=0, commas=False) # e.g. "15"
energy_j_str = fmt(energy_per_day_j_value, precision=1, commas=False) # e.g. "1.5"
# Energy
energy_per_inf_mj = inference_power_mw * inference_latency_ms / 1000
energy_per_day_j = inferences_per_day * energy_per_inf_mj / 1000
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(mnv2_int8_mb < 512, "MobileNetV2 INT8 must fit in 512 MB edge RAM.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
mnv1_params_str = fmt(mnv1_params_m, precision=1, commas=False) # e.g. "4.2"
mnv1_flops_str = fmt(mnv1_flops_mflops, precision=0, commas=False) # e.g. "569"
mnv1_fp32_str = fmt(mnv1_fp32_mb, precision=0, commas=False) # e.g. "17"
mnv1_int8_str = fmt(mnv1_int8_mb, precision=0, commas=False) # e.g. "4"
mnv2_params_str = fmt(mnv2_params_m, precision=1, commas=False) # e.g. "2.2"
mnv2_flops_str = fmt(mnv2_flops_mflops, precision=0, commas=False) # e.g. "150"
mnv2_fp32_str = fmt(mnv2_fp32_mb, precision=0, commas=False) # e.g. "9"
mnv2_int8_str = fmt(mnv2_int8_mb, precision=1, commas=False) # e.g. "2.2"
kws_example_params_str = fmt(kws_example_params_k, precision=0, commas=False) # e.g. "26"
kws_example_flops_str = fmt(kws_example_flops_mflops, precision=0, commas=False) # e.g. "6"
energy_mj_str = fmt(energy_per_inf_mj, precision=0, commas=False) # e.g. "15"
energy_j_str = fmt(energy_per_day_j, precision=1, commas=False) # e.g. "1.5"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
mnv1_params_str = WildlifeModelSizing.mnv1_params_str
mnv1_flops_str = WildlifeModelSizing.mnv1_flops_str
mnv1_fp32_str = WildlifeModelSizing.mnv1_fp32_str
mnv1_int8_str = WildlifeModelSizing.mnv1_int8_str
mnv2_params_str = WildlifeModelSizing.mnv2_params_str
mnv2_flops_str = WildlifeModelSizing.mnv2_flops_str
mnv2_fp32_str = WildlifeModelSizing.mnv2_fp32_str
mnv2_int8_str = WildlifeModelSizing.mnv2_int8_str
kws_example_params_str = WildlifeModelSizing.kws_example_params_str
kws_example_flops_str = WildlifeModelSizing.kws_example_flops_str
energy_mj_str = WildlifeModelSizing.energy_mj_str
energy_j_str = WildlifeModelSizing.energy_j_str
```
With the throughput ceiling established, we can now apply the complete decision framework to a realistic scenario that exercises every step.
@@ -4099,11 +4184,23 @@ Engineers add attention to CNNs or convolutions to Transformers expecting additi
from mlsys.constants import A100_MEM_CAPACITY, GiB
# --- 8-GPU cluster memory ---
a100_8x_mem_value = int(A100_MEM_CAPACITY.to(GiB).magnitude) * 8
class A100ClusterMemory:
"""Contrast datacenter and edge memory: 8-GPU A100 node vs 4 GB edge device."""
# --- Outputs (formatted strings for prose) ---
a100_8x_mem_str = f"{a100_8x_mem_value}" # e.g. "640"
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
n_gpus = 8
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
a100_8x_mem = int(A100_MEM_CAPACITY.m_as(GiB)) * n_gpus
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(a100_8x_mem > 400, "8x A100 cluster should provide >400 GiB memory.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
a100_8x_mem_str = f"{a100_8x_mem}" # e.g. "640"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
a100_8x_mem_str = A100ClusterMemory.a100_8x_mem_str
```
**Pitfall:** *Optimizing architectural decisions for training hardware without considering deployment constraints.*

File diff suppressed because it is too large Load Diff

View File

@@ -26,7 +26,6 @@ start_chapter("vol1:model_compression")
:::
## Purpose {.unnumbered}
\begin{marginfigure}
@@ -78,102 +77,137 @@ Bridging that gap requires a systematic discipline of *compression*: trading cap
from mlsys.constants import *
from mlsys.formatting import fmt, check, sci
# --- Inputs (GPU specs) ---
a100_tflops_fp16_value = A100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude
a100_tflops_int8_value = A100_FLOPS_INT8.to(TFLOPs / second).magnitude
a100_bw_tbs_value = A100_MEM_BW.to(TB / second).magnitude
a100_int8_speedup_value = int(a100_tflops_int8_value / a100_tflops_fp16_value)
class CompressionSetup:
"""Chapter-wide constants: GPU specs, energy physics, model sizes, device constraints."""
# --- Inputs (energy/perf illustrative values) ---
int8_energy_reduction_value = 20
mobilenet_int8_mj_value = 47
mobilenet_fp32_mj_value = 312
tpu_v4_tops_per_w_value = 0.9
v100_tops_per_w_value = 0.3
bandwidth_bound_speedup_value = 4
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
# Illustrative energy/perf values
int8_energy_reduction = 20
mobilenet_int8_mj = 47
mobilenet_fp32_mj = 312
tpu_v4_tops_per_w = 0.9
v100_tops_per_w = 0.3
bandwidth_bound_speedup = 4
llm_7b_params = 7
gpt3_training_flops_exp = 23
# --- Inputs (energy: multiply-add operations from constants) ---
energy_dram_value = ENERGY_DRAM_ACCESS_PJ.magnitude
energy_dram_per_byte_value = ENERGY_DRAM_PJ_PER_BYTE.magnitude
energy_flop_fp32_value = ENERGY_FLOP_FP32_PJ.magnitude
energy_flop_int8_value = ENERGY_FLOP_INT8_PJ.magnitude
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
# A100 specs
a100_tflops_fp16 = A100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second)
a100_tflops_int8 = A100_FLOPS_INT8.m_as(TFLOPs / second)
a100_bw_tbs = A100_MEM_BW.m_as(TB / second)
a100_int8_speedup = int(a100_tflops_int8 / a100_tflops_fp16)
# Energy for addition operations (Horowitz 2014, 45nm process)
energy_add_fp32_pj_value = ENERGY_ADD_FP32_PJ.to(ureg.picojoule).magnitude
energy_add_fp16_pj_value = ENERGY_ADD_FP16_PJ.to(ureg.picojoule).magnitude
energy_add_int32_pj_value = ENERGY_ADD_INT32_PJ.to(ureg.picojoule).magnitude
energy_add_int8_pj_value = ENERGY_ADD_INT8_PJ.to(ureg.picojoule).magnitude
energy_mul_fp32_pj_value = ENERGY_FLOP_FP32_PJ.magnitude
# Energy from constants (Horowitz 2014, 45nm process)
energy_dram = ENERGY_DRAM_ACCESS_PJ.m_as(ureg.picojoule)
energy_dram_per_byte = ENERGY_DRAM_PJ_PER_BYTE.m_as(ureg.picojoule / ureg.byte)
energy_flop_fp32 = ENERGY_FLOP_FP32_PJ.m_as(ureg.picojoule / ureg.count)
energy_flop_int8 = ENERGY_FLOP_INT8_PJ.m_as(ureg.picojoule / ureg.count)
energy_add_fp32_pj = ENERGY_ADD_FP32_PJ.m_as(ureg.picojoule)
energy_add_fp16_pj = ENERGY_ADD_FP16_PJ.m_as(ureg.picojoule)
energy_add_int32_pj = ENERGY_ADD_INT32_PJ.m_as(ureg.picojoule)
energy_add_int8_pj = ENERGY_ADD_INT8_PJ.m_as(ureg.picojoule)
energy_mul_fp32_pj = ENERGY_FLOP_FP32_PJ.m_as(ureg.picojoule / ureg.count)
# INT8 vs FP32 energy ratio (MAC-to-MAC: multiply + add for each precision)
fp32_mac_pj_value = energy_mul_fp32_pj_value + energy_add_fp32_pj_value # 3.7 + 0.9 = 4.6 pJ
int8_mac_pj_value = energy_flop_int8_value + energy_add_int8_pj_value # 0.2 + 0.03 = 0.23 pJ
int8_fp32_energy_ratio_value = fp32_mac_pj_value / int8_mac_pj_value
# INT8 vs FP32 MAC energy ratio
fp32_mac_pj = energy_mul_fp32_pj + energy_add_fp32_pj # 3.7 + 0.9 = 4.6 pJ
int8_mac_pj = energy_flop_int8 + energy_add_int8_pj # 0.2 + 0.03 = 0.23 pJ
int8_fp32_energy_ratio = fp32_mac_pj / int8_mac_pj
# V100 specs
v100_bw_gbs_value = V100_MEM_BW.to(GB / second).magnitude
v100_tflops_fp32_value = V100_FLOPS_FP32.to(TFLOPs / second).magnitude
# V100 specs
v100_bw_gbs = V100_MEM_BW.m_as(GB / second)
v100_tflops_fp32 = V100_FLOPS_FP32.m_as(TFLOPs / second)
# Model specs
resnet_params_m_value = RESNET50_PARAMS.to(Mparam).magnitude
resnet_gflops_value = RESNET50_FLOPs.to(GFLOPs).magnitude
mobilenetv2_mflops_value = MOBILENETV2_FLOPs.to(GFLOPs).magnitude * 1000
# Model specs
resnet_params_m = RESNET50_PARAMS.m_as(Mparam)
resnet_gflops = RESNET50_FLOPs.m_as(GFLOPs)
mobilenetv2_mflops = MOBILENETV2_FLOPs.m_as(GFLOPs) * 1000
# LLM parameter/memory calculations
llm_7b_params_value = 7
llm_7b_mem_fp16_gb_value = llm_7b_params_value * 2
llm_175b_params_value = GPT3_PARAMS.to(Bparam).magnitude
llm_175b_mem_fp16_gb_value = llm_175b_params_value * 2
# LLM memory
llm_7b_mem_fp16_gb = llm_7b_params * 2
llm_175b_params = GPT3_PARAMS.m_as(Bparam)
llm_175b_mem_fp16_gb = llm_175b_params * 2
# Device memory constraints
smartphone_ram_gb_value = SMARTPHONE_RAM_GB.to(GB).magnitude
mcu_ram_kb_value = MCU_RAM_KIB.to(KiB).magnitude
# Device memory
smartphone_ram_gb = SMARTPHONE_RAM_GB.m_as(GB)
mcu_ram_kb = MCU_RAM_KIB.m_as(KiB)
# GPT-3 training FLOPs
gpt3_training_flops_exp_value = 23
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(a100_int8_speedup >= 2, "A100 INT8 should be at least 2x faster than FP16.")
check(int8_fp32_energy_ratio > 1, "FP32 MAC must cost more energy than INT8 MAC.")
# --- Outputs (formatted strings for prose) ---
a100_tflops_fp16_str = fmt(a100_tflops_fp16_value, precision=0, commas=False)
a100_tflops_int8_str = fmt(a100_tflops_int8_value, precision=0, commas=False)
a100_bw_tbs_str = fmt(a100_bw_tbs_value, precision=1, commas=False)
a100_int8_speedup_str = fmt(a100_int8_speedup_value, precision=0, commas=False)
int8_energy_reduction_str = fmt(int8_energy_reduction_value, precision=0, commas=False)
mobilenet_int8_mj_str = fmt(mobilenet_int8_mj_value, precision=0, commas=False)
mobilenet_fp32_mj_str = fmt(mobilenet_fp32_mj_value, precision=0, commas=False)
tpu_v4_tops_per_w_str = fmt(tpu_v4_tops_per_w_value, precision=1, commas=False)
v100_tops_per_w_str = fmt(v100_tops_per_w_value, precision=1, commas=False)
bandwidth_bound_speedup_str = fmt(bandwidth_bound_speedup_value, precision=0, commas=False)
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
a100_tflops_fp16_str = fmt(a100_tflops_fp16, precision=0, commas=False)
a100_tflops_int8_str = fmt(a100_tflops_int8, precision=0, commas=False)
a100_bw_tbs_str = fmt(a100_bw_tbs, precision=1, commas=False)
a100_int8_speedup_str = fmt(a100_int8_speedup, precision=0, commas=False)
int8_energy_reduction_str = fmt(int8_energy_reduction, precision=0, commas=False)
mobilenet_int8_mj_str = fmt(mobilenet_int8_mj, precision=0, commas=False)
mobilenet_fp32_mj_str = fmt(mobilenet_fp32_mj, precision=0, commas=False)
tpu_v4_tops_per_w_str = fmt(tpu_v4_tops_per_w, precision=1, commas=False)
v100_tops_per_w_str = fmt(v100_tops_per_w, precision=1, commas=False)
bandwidth_bound_speedup_str = fmt(bandwidth_bound_speedup, precision=0, commas=False)
energy_dram_str = fmt(energy_dram, precision=0, commas=False)
energy_dram_per_byte_str = fmt(energy_dram_per_byte, precision=0, commas=False)
energy_flop_fp32_str = f"{energy_flop_fp32}"
energy_flop_int8_str = f"{energy_flop_int8}"
energy_add_fp32_str = f"{energy_add_fp32_pj}"
energy_add_fp16_str = f"{energy_add_fp16_pj}"
energy_add_int32_str = f"{energy_add_int32_pj}"
energy_add_int8_str = f"{energy_add_int8_pj}"
energy_mul_fp32_str = f"{energy_mul_fp32_pj}"
int8_fp32_energy_ratio_str = fmt(int8_fp32_energy_ratio, precision=1, commas=False)
v100_bw_gbs_str = fmt(v100_bw_gbs, precision=0, commas=False)
v100_tflops_fp32_str = fmt(v100_tflops_fp32, precision=1, commas=False)
resnet_params_m_str = fmt(resnet_params_m, precision=1, commas=False)
resnet_gflops_str = fmt(resnet_gflops, precision=1, commas=False)
mobilenetv2_mflops_str = fmt(mobilenetv2_mflops, precision=0, commas=False)
llm_7b_str = f"{llm_7b_params}"
llm_7b_mem_str = fmt(llm_7b_mem_fp16_gb, precision=0, commas=False)
llm_175b_str = fmt(llm_175b_params, precision=0, commas=False)
llm_175b_mem_str = fmt(llm_175b_mem_fp16_gb, precision=0, commas=False)
smartphone_ram_str = f"{smartphone_ram_gb}"
mcu_ram_str = f"{mcu_ram_kb}"
gpt3_training_flops_str = f"$3.14 \\times 10^{{{gpt3_training_flops_exp}}}$"
energy_dram_str = fmt(energy_dram_value, precision=0, commas=False)
energy_dram_per_byte_str = fmt(energy_dram_per_byte_value, precision=0, commas=False)
energy_flop_fp32_str = f"{energy_flop_fp32_value}"
energy_flop_int8_str = f"{energy_flop_int8_value}"
energy_add_fp32_str = f"{energy_add_fp32_pj_value}"
energy_add_fp16_str = f"{energy_add_fp16_pj_value}"
energy_add_int32_str = f"{energy_add_int32_pj_value}"
energy_add_int8_str = f"{energy_add_int8_pj_value}"
energy_mul_fp32_str = f"{energy_mul_fp32_pj_value}"
int8_fp32_energy_ratio_str = fmt(int8_fp32_energy_ratio_value, precision=1, commas=False)
v100_bw_gbs_str = fmt(v100_bw_gbs_value, precision=0, commas=False)
v100_tflops_fp32_str = fmt(v100_tflops_fp32_value, precision=1, commas=False)
resnet_params_m_str = fmt(resnet_params_m_value, precision=1, commas=False)
resnet_gflops_str = fmt(resnet_gflops_value, precision=1, commas=False)
mobilenetv2_mflops_str = fmt(mobilenetv2_mflops_value, precision=0, commas=False)
llm_7b_str = f"{llm_7b_params_value}"
llm_7b_mem_str = fmt(llm_7b_mem_fp16_gb_value, precision=0, commas=False)
llm_175b_str = fmt(llm_175b_params_value, precision=0, commas=False)
llm_175b_mem_str = fmt(llm_175b_mem_fp16_gb_value, precision=0, commas=False)
smartphone_ram_str = f"{smartphone_ram_gb_value}"
mcu_ram_str = f"{mcu_ram_kb_value}"
gpt3_training_flops_str = f"$3.14 \\times 10^{{{gpt3_training_flops_exp_value}}}$"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
a100_tflops_fp16_str = CompressionSetup.a100_tflops_fp16_str
a100_tflops_int8_str = CompressionSetup.a100_tflops_int8_str
a100_bw_tbs_str = CompressionSetup.a100_bw_tbs_str
a100_int8_speedup_str = CompressionSetup.a100_int8_speedup_str
int8_energy_reduction_str = CompressionSetup.int8_energy_reduction_str
mobilenet_int8_mj_str = CompressionSetup.mobilenet_int8_mj_str
mobilenet_fp32_mj_str = CompressionSetup.mobilenet_fp32_mj_str
tpu_v4_tops_per_w_str = CompressionSetup.tpu_v4_tops_per_w_str
v100_tops_per_w_str = CompressionSetup.v100_tops_per_w_str
bandwidth_bound_speedup_str = CompressionSetup.bandwidth_bound_speedup_str
energy_dram_str = CompressionSetup.energy_dram_str
energy_dram_per_byte_str = CompressionSetup.energy_dram_per_byte_str
energy_flop_fp32_str = CompressionSetup.energy_flop_fp32_str
energy_flop_int8_str = CompressionSetup.energy_flop_int8_str
energy_add_fp32_str = CompressionSetup.energy_add_fp32_str
energy_add_fp16_str = CompressionSetup.energy_add_fp16_str
energy_add_int32_str = CompressionSetup.energy_add_int32_str
energy_add_int8_str = CompressionSetup.energy_add_int8_str
energy_mul_fp32_str = CompressionSetup.energy_mul_fp32_str
int8_fp32_energy_ratio_str = CompressionSetup.int8_fp32_energy_ratio_str
v100_bw_gbs_str = CompressionSetup.v100_bw_gbs_str
v100_tflops_fp32_str = CompressionSetup.v100_tflops_fp32_str
resnet_params_m_str = CompressionSetup.resnet_params_m_str
resnet_gflops_str = CompressionSetup.resnet_gflops_str
mobilenetv2_mflops_str = CompressionSetup.mobilenetv2_mflops_str
llm_7b_str = CompressionSetup.llm_7b_str
llm_7b_mem_str = CompressionSetup.llm_7b_mem_str
llm_175b_str = CompressionSetup.llm_175b_str
llm_175b_mem_str = CompressionSetup.llm_175b_mem_str
smartphone_ram_str = CompressionSetup.smartphone_ram_str
mcu_ram_str = CompressionSetup.mcu_ram_str
gpt3_training_flops_str = CompressionSetup.gpt3_training_flops_str
# Note: v100_bw_gbs_value used by downstream fusion-calc cell
v100_bw_gbs_value = CompressionSetup.v100_bw_gbs
v100_tflops_fp32_value = CompressionSetup.v100_tflops_fp32
```
## Optimization Framework {#sec-model-compression-optimization-framework-9e21}
A `{python} llm_7b_str`-billion parameter language model requires `{python} llm_7b_mem_str` GB just to store its weights in FP16. Your deployment target is a smartphone with `{python} smartphone_ram_str` GB of RAM shared across the operating system, applications, and your model. *The math does not work.* No amount of clever engineering changes this arithmetic: `{python} llm_7b_mem_str` GB cannot fit in `{python} smartphone_ram_str` GB. Yet users expect the model to run: responsively, offline, without draining their battery in an hour. The gap between what training produces and what deployment permits (the Latency Budget, the maximum allowable end-to-end inference time, defined formally in @sec-model-serving) is not a minor inconvenience but a defining challenge of model compression.
@@ -420,7 +454,6 @@ We call this phenomenon *the quantization speedup*.
The relative importance of each dimension varies by deployment target. Cloud systems may tolerate larger models but demand throughput; mobile devices prioritize memory and energy; embedded systems face hard constraints on all resources simultaneously. Understanding these deployment contexts shapes which optimization dimensions to prioritize.
## Deployment Context {#sec-model-compression-deployment-context-0d88}
The optimization framework above identifies three dimensions of compression, but which dimensions matter most depends entirely on where the model will run. A datacenter GPU with 80 GB of HBM faces different binding constraints than a smartphone with shared RAM or a microcontroller with 256 KB of SRAM. @tbl-deployment-scenarios summarizes the key constraints across deployment environments.
@@ -482,55 +515,80 @@ from mlsys.constants import (GB, GiB, MiB, KiB, MB, KB, byte,
CLOUD_MEM_GIB, MOBILE_MEM_GIB, TINY_MEM_KIB,
DLRM_MODEL_SIZE_FP32)
# --- Inputs (device capacities and model sizes) ---
cloud_mem_value = CLOUD_MEM_GIB
mobile_mem_value = MOBILE_MEM_GIB
tiny_mem_value = TINY_MEM_KIB
dlrm_mem_value = DLRM_MODEL_SIZE_FP32
gpt2_mem_value = 6 * GiB
resnet_mem_value = 100 * MiB
mobilenet_mem_value = 14 * MiB
mobilenet_int8_mem_value = 3.5 * MiB
dscnn_mem_value = 500 * KiB
# --- Process (compute fit ratios) ---
def get_ratio(model_mem, device_mem):
ratio = model_mem.to(byte).magnitude / device_mem.to(byte).magnitude
def _get_ratio(model_mem, device_mem):
"""Return 'ok' if model fits, else 'no (Nx)' with how many times it overflows."""
ratio = model_mem.m_as(byte) / device_mem.m_as(byte)
if ratio < 1:
return "ok"
return f"no ({ratio:.0f}x)"
dlrm_mobile_value = get_ratio(dlrm_mem_value, mobile_mem_value)
dlrm_tiny_value = get_ratio(dlrm_mem_value, tiny_mem_value)
class ModelDeviceComparison:
"""Contrast model requirements with device memory: 6-order-of-magnitude deployment gap."""
gpt2_mobile_value = get_ratio(gpt2_mem_value, mobile_mem_value)
gpt2_tiny_value = get_ratio(gpt2_mem_value, tiny_mem_value)
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
# Device capacities
cloud_mem = CLOUD_MEM_GIB
mobile_mem = MOBILE_MEM_GIB
tiny_mem = TINY_MEM_KIB
resnet_tiny_value = get_ratio(resnet_mem_value, tiny_mem_value)
mobilenet_tiny_value = get_ratio(mobilenet_mem_value, tiny_mem_value)
mobilenet_int8_tiny_value = get_ratio(mobilenet_int8_mem_value, tiny_mem_value)
# Model sizes
dlrm_mem = DLRM_MODEL_SIZE_FP32
gpt2_mem = 6 * GiB
resnet_mem = 100 * MiB
mobilenet_mem = 14 * MiB
mobilenet_int8_mem = 3.5 * MiB
dscnn_mem = 500 * KiB
# --- Outputs (formatted strings for prose) ---
dlrm_str = f"{dlrm_mem_value.to(GB).magnitude:.0f} GB"
gpt2_str = f"{gpt2_mem_value.to(GiB).magnitude:.0f} GB"
resnet_str = f"{resnet_mem_value.to(MiB).magnitude:.0f} MB"
mobilenet_str = f"{mobilenet_mem_value.to(MiB).magnitude:.0f} MB"
mobilenet_int8_str = f"{mobilenet_int8_mem_value.to(MiB).magnitude:.1f} MB"
dscnn_str = f"{dscnn_mem_value.to(KiB).magnitude:.0f} KB"
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
dlrm_mobile = _get_ratio(dlrm_mem, mobile_mem)
dlrm_tiny = _get_ratio(dlrm_mem, tiny_mem)
gpt2_mobile = _get_ratio(gpt2_mem, mobile_mem)
gpt2_tiny = _get_ratio(gpt2_mem, tiny_mem)
resnet_tiny = _get_ratio(resnet_mem, tiny_mem)
mobilenet_tiny = _get_ratio(mobilenet_mem, tiny_mem)
mobilenet_int8_tiny = _get_ratio(mobilenet_int8_mem, tiny_mem)
cloud_cap_str = f"~{cloud_mem_value.to(GiB).magnitude:.0f} GB"
mobile_cap_str = f"~{mobile_mem_value.to(GiB).magnitude:.0f} GB"
tiny_cap_str = f"~{tiny_mem_value.to(KiB).magnitude:.0f} KB"
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
# DS-CNN always fits TinyML — sanity check
assert _get_ratio(dscnn_mem, tiny_mem) == "ok", "DS-CNN must fit in TinyML device."
dlrm_mobile_str = dlrm_mobile_value
dlrm_tiny_str = dlrm_tiny_value
gpt2_mobile_str = gpt2_mobile_value
gpt2_tiny_str = gpt2_tiny_value
resnet_tiny_str = resnet_tiny_value
mobilenet_tiny_str = mobilenet_tiny_value
mobilenet_int8_tiny_str = mobilenet_int8_tiny_value
dscnn_tiny_str = "ok"
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
dlrm_str = f"{dlrm_mem.m_as(GB):.0f} GB"
gpt2_str = f"{gpt2_mem.m_as(GiB):.0f} GB"
resnet_str = f"{resnet_mem.m_as(MiB):.0f} MB"
mobilenet_str = f"{mobilenet_mem.m_as(MiB):.0f} MB"
mobilenet_int8_str = f"{mobilenet_int8_mem.m_as(MiB):.1f} MB"
dscnn_str = f"{dscnn_mem.m_as(KiB):.0f} KB"
cloud_cap_str = f"~{cloud_mem.m_as(GiB):.0f} GB"
mobile_cap_str = f"~{mobile_mem.m_as(GiB):.0f} GB"
tiny_cap_str = f"~{tiny_mem.m_as(KiB):.0f} KB"
dlrm_mobile_str = dlrm_mobile
dlrm_tiny_str = dlrm_tiny
gpt2_mobile_str = gpt2_mobile
gpt2_tiny_str = gpt2_tiny
resnet_tiny_str = resnet_tiny
mobilenet_tiny_str = mobilenet_tiny
mobilenet_int8_tiny_str = mobilenet_int8_tiny
dscnn_tiny_str = "ok"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
dlrm_str = ModelDeviceComparison.dlrm_str
gpt2_str = ModelDeviceComparison.gpt2_str
resnet_str = ModelDeviceComparison.resnet_str
mobilenet_str = ModelDeviceComparison.mobilenet_str
mobilenet_int8_str = ModelDeviceComparison.mobilenet_int8_str
dscnn_str = ModelDeviceComparison.dscnn_str
cloud_cap_str = ModelDeviceComparison.cloud_cap_str
mobile_cap_str = ModelDeviceComparison.mobile_cap_str
tiny_cap_str = ModelDeviceComparison.tiny_cap_str
dlrm_mobile_str = ModelDeviceComparison.dlrm_mobile_str
dlrm_tiny_str = ModelDeviceComparison.dlrm_tiny_str
gpt2_mobile_str = ModelDeviceComparison.gpt2_mobile_str
gpt2_tiny_str = ModelDeviceComparison.gpt2_tiny_str
resnet_tiny_str = ModelDeviceComparison.resnet_tiny_str
mobilenet_tiny_str = ModelDeviceComparison.mobilenet_tiny_str
mobilenet_int8_tiny_str = ModelDeviceComparison.mobilenet_int8_tiny_str
dscnn_tiny_str = ModelDeviceComparison.dscnn_tiny_str
```
| **Model** | **Memory** **(Runtime)** | **Storage** **(Weights)** | **Cloud** **(`{python} cloud_cap_str`)** | **Mobile** **(`{python} mobile_cap_str`)** | **TinyML** **(`{python} tiny_cap_str`)** |
@@ -600,7 +658,6 @@ Optimization is about trading one resource for another.
Each deployment context above imposes a binding constraint: memory capacity on mobile devices, latency on real-time systems, energy on battery-powered sensors. The optimization techniques that follow address these constraints at three successive levels of the stack. We begin with structural methods that modify *what* computations occur, reducing the model's parameter count and operation count to fit tighter memory and compute budgets. We then turn to precision techniques that reduce how many bits represent each value, directly shrinking memory footprint and accelerating arithmetic. Finally, we address architectural approaches that improve how efficiently the remaining operations execute on physical hardware, closing the gap between theoretical savings and measured performance.
## Structural Optimization {#sec-model-compression-structural-optimization-ee93}
\index{Model Compression!structural optimization}
@@ -2764,7 +2821,6 @@ Test your understanding of the structural optimization techniques covered so far
- [ ] Can you identify when to choose Neural Architecture Search over manual architecture design? Consider the trade-offs in computational cost, design space coverage, and hardware-specific optimization.
:::
## Quantization and Precision {#sec-model-compression-quantization-precision-cd46}
\index{Model Compression!precision optimization}
@@ -3690,44 +3746,57 @@ Compare the two mapping diagrams side by side in @fig-calibration-ranges. Symmet
# │ zero_point_str, x_val_str, x_q_str, x_recon_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.constants import KIB_TO_BYTES
# --- Inputs (activation range example) ---
alpha_value = -1.0
beta_value = 3.0
bits_value = 8
x_val_value = 0.0 # value to quantize
class QuantizationMathCalc:
"""Derive affine quantization parameters: scale and zero-point for [-1.0, 3.0] → UINT8."""
# --- Process (calculate affine parameters) ---
# 1. Calculate Scale (s)
# s = (beta - alpha) / (2^b - 1)
int_steps_value = 2**bits_value - 1
scale_value = (beta_value - alpha_value) / int_steps_value
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
alpha = -1.0 # activation range min
beta = 3.0 # activation range max
bits = 8 # target bit-width
x_val = 0.0 # value to quantize
# 2. Calculate Zero-Point (z)
# z = round(-alpha / s)
# Note: z maps the real value 0.0 to an integer
zero_point_value = round(-alpha_value / scale_value)
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
# 1. Scale: s = (beta - alpha) / (2^b - 1)
int_steps = 2**bits - 1
scale = (beta - alpha) / int_steps
# 3. Quantize a value
# x_q = clamp(round(x / s) + z, 0, 2^b - 1)
x_q_raw = round(x_val_value / scale_value) + zero_point_value
x_q_value = max(0, min(int_steps_value, x_q_raw))
# 2. Zero-point: z = round(-alpha / s)
zero_point = round(-alpha / scale)
# 4. Dequantize (reconstruct)
# x_recon = (x_q - z) * s
x_recon_value = (x_q_value - zero_point_value) * scale_value
# 3. Quantize: x_q = clamp(round(x/s) + z, 0, 2^b - 1)
x_q_raw = round(x_val / scale) + zero_point
x_q = max(0, min(int_steps, x_q_raw))
# --- Outputs (formatted strings for prose) ---
alpha_str = fmt(alpha_value, precision=1, commas=False) # "-1.0"
beta_str = fmt(beta_value, precision=1, commas=False) # "3.0"
range_str = fmt(beta_value - alpha_value, precision=1, commas=False) # "4.0"
steps_str = f"{int_steps_value}" # "255"
scale_str = fmt(scale_value, precision=4, commas=False) # "0.0157"
zero_point_str = f"{int(zero_point_value)}" # "64"
x_val_str = fmt(x_val_value, precision=1, commas=False) # "0.0"
x_q_str = f"{int(x_q_value)}" # "64"
x_recon_str = fmt(x_recon_value, precision=2, commas=False) # "0.00"
# 4. Dequantize: x_recon = (x_q - z) * s
x_recon = (x_q - zero_point) * scale
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(scale > 0, "Scale must be positive.")
check(0 <= zero_point <= int_steps, "Zero-point must be in valid integer range.")
check(abs(x_recon - x_val) < scale, "Reconstruction error must be less than one step size.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
alpha_str = fmt(alpha, precision=1, commas=False) # "-1.0"
beta_str = fmt(beta, precision=1, commas=False) # "3.0"
range_str = fmt(beta - alpha, precision=1, commas=False) # "4.0"
steps_str = f"{int_steps}" # "255"
scale_str = fmt(scale, precision=4, commas=False) # "0.0157"
zero_point_str = f"{int(zero_point)}" # "64"
x_val_str = fmt(x_val, precision=1, commas=False) # "0.0"
x_q_str = f"{int(x_q)}" # "64"
x_recon_str = fmt(x_recon, precision=2, commas=False) # "0.00"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
alpha_str = QuantizationMathCalc.alpha_str
beta_str = QuantizationMathCalc.beta_str
range_str = QuantizationMathCalc.range_str
steps_str = QuantizationMathCalc.steps_str
scale_str = QuantizationMathCalc.scale_str
zero_point_str = QuantizationMathCalc.zero_point_str
x_val_str = QuantizationMathCalc.x_val_str
x_q_str = QuantizationMathCalc.x_q_str
x_recon_str = QuantizationMathCalc.x_recon_str
```
::: {.callout-notebook title="Calculating Scale and Zero-Point"}
@@ -4326,7 +4395,6 @@ Yet practitioners often discover a frustrating gap between theory and practice:
The gap arises from several sources. Sparse matrices stored in dense format waste memory bandwidth loading zeros—the hardware cannot skip what it does not know is zero. Operations that could run in parallel execute sequentially due to data dependencies the compiler cannot resolve. Simple inputs receive the same computational budget as complex ones because the model has no mechanism to exit early. Closing the gap between "optimized on paper" and "optimized in practice" is the domain of our third optimization dimension: **architectural efficiency**. This dimension ensures that structural and precision optimizations translate into real-world speedups by aligning computation patterns with hardware capabilities.
## Architectural Efficiency {#sec-model-compression-architectural-efficiency-8dd3}
Architectural efficiency optimization ensures that computations execute efficiently on target hardware by aligning model operations with processor capabilities and memory hierarchies. Where representation optimization determines *what* computations to perform and precision optimization determines *how precisely* to compute, architectural efficiency addresses *how* operations are scheduled, memory is accessed, and workloads adapt to input characteristics. This third dimension closes the gap between theoretical compression ratios and real-world speedups.
@@ -4452,77 +4520,102 @@ Beyond reducing what data must be stored, substantial efficiency gains emerge fr
# │ kernels_fused_str, saved_latency_ms_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.constants import KIB_TO_BYTES
from mlsys.constants import KIB_TO_BYTES, MILLION
# --- Inputs (Conv-BN-ReLU) ---
conv_channels_value = 256
conv_spatial_value = 28
bytes_per_element_value = 4
class FusionCalc:
"""Quantify latency and bandwidth benefits of Conv-BN-ReLU operator fusion on ResNet-50."""
# GEMM
gemm_hidden_value = 768
gemm_seq_value = 512
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
# Conv-BN-ReLU layer geometry
conv_channels = 256
conv_spatial = 28
bytes_per_element = 4 # FP32
# Memory Bandwidth Analysis (ResNet-50 layer)
# Feature map: 256 channels × 28 × 28 spatial × 4 bytes/element (FP32)
feat_map_mb_value = conv_channels_value * conv_spatial_value * conv_spatial_value * bytes_per_element_value / MILLION # SI MB
weights_mb_value = 2.4
bn_params_mb_value = 0.002
# GEMM geometry
gemm_hidden = 768
gemm_seq = 512
# Kernel Launch
kernels_unfused_value = 159
kernels_fused_value = 53
latency_per_kernel_us_value = 10
# ResNet-50 layer memory baseline
weights_mb = 2.4
bn_params_mb = 0.002
# --- Process ---
# Conv-BN-ReLU intermediate
conv_bn_relu_intermediate_bytes = 2 * conv_channels_value * conv_spatial_value * conv_spatial_value * bytes_per_element_value
conv_bn_relu_intermediate_mb_value = conv_bn_relu_intermediate_bytes / (1024**2)
# Kernel launch overhead
kernels_unfused = 159
kernels_fused = 53
latency_per_kernel_us = 10
# GEMM intermediate
gemm_intermediate_bytes = gemm_hidden_value * gemm_seq_value * bytes_per_element_value
gemm_intermediate_mb_value = gemm_intermediate_bytes / (1024**2)
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
# Feature map size (SI MB)
feat_map_mb = conv_channels * conv_spatial * conv_spatial * bytes_per_element / MILLION
# Bandwidth Analysis
unfused_conv_mb_value = feat_map_mb_value * 2 + weights_mb_value
unfused_bn_mb_value = feat_map_mb_value * 2 + bn_params_mb_value
unfused_relu_mb_value = feat_map_mb_value * 2
total_unfused_mb_value = unfused_conv_mb_value + unfused_bn_mb_value + unfused_relu_mb_value
# Conv-BN-ReLU intermediate (2 feature maps written: conv→BN boundary)
conv_bn_relu_intermediate_mb = (
2 * conv_channels * conv_spatial * conv_spatial * bytes_per_element / (1024**2)
)
total_fused_mb_value = feat_map_mb_value * 2 + weights_mb_value
bandwidth_reduction_pct_value = (1 - total_fused_mb_value / total_unfused_mb_value) * 100
# GEMM intermediate
gemm_intermediate_mb = gemm_hidden * gemm_seq * bytes_per_element / (1024**2)
# Kernel Launch
saved_latency_us_value = (kernels_unfused_value - kernels_fused_value) * latency_per_kernel_us_value
saved_latency_ms_value = saved_latency_us_value / 1000
# Unfused bandwidth: Conv (feat*2 + weights) + BN (feat*2 + bn) + ReLU (feat*2)
unfused_conv_mb = feat_map_mb * 2 + weights_mb
unfused_bn_mb = feat_map_mb * 2 + bn_params_mb
unfused_relu_mb = feat_map_mb * 2
total_unfused_mb = unfused_conv_mb + unfused_bn_mb + unfused_relu_mb
# V100 timing analysis (memory-bound)
v100_bw_gbs_local_value = v100_bw_gbs_value # from earlier cell
unfused_time_us_value = total_unfused_mb_value / v100_bw_gbs_local_value * 1000 # MB / (GB/s) * 1000 = us
fused_time_us_value = total_fused_mb_value / v100_bw_gbs_local_value * 1000
fusion_speedup_value = unfused_time_us_value / fused_time_us_value
# Fused bandwidth: read input + weights once, write output once
total_fused_mb = feat_map_mb * 2 + weights_mb
bandwidth_reduction_pct = (1 - total_fused_mb / total_unfused_mb) * 100
# --- Outputs (formatted strings for prose) ---
conv_bn_relu_intermediate_mb_str = fmt(conv_bn_relu_intermediate_mb_value, precision=1, commas=False)
gemm_intermediate_mb_str = fmt(gemm_intermediate_mb_value, precision=1, commas=False)
# Kernel launch savings
saved_latency_us = (kernels_unfused - kernels_fused) * latency_per_kernel_us
saved_latency_ms = saved_latency_us / 1000
feat_map_kb_str = fmt(feat_map_mb_value * 1000, precision=0, commas=False)
weights_mb_str = fmt(weights_mb_value, precision=1, commas=False)
bn_params_kb_str = fmt(bn_params_mb_value * KIB_TO_BYTES, precision=0, commas=False)
# V100 timing (memory-bound): MB / (GB/s) * 1000 = µs
unfused_time_us = total_unfused_mb / v100_bw_gbs_value * 1000
fused_time_us = total_fused_mb / v100_bw_gbs_value * 1000
fusion_speedup = unfused_time_us / fused_time_us
unfused_conv_mb_str = fmt(unfused_conv_mb_value, precision=1, commas=False)
unfused_bn_mb_str = fmt(unfused_bn_mb_value, precision=1, commas=False)
unfused_relu_mb_str = fmt(unfused_relu_mb_value, precision=1, commas=False)
total_unfused_mb_str = fmt(total_unfused_mb_value, precision=1, commas=False)
total_fused_mb_str = fmt(total_fused_mb_value, precision=1, commas=False)
bandwidth_reduction_pct_str = fmt(bandwidth_reduction_pct_value, precision=0, commas=False)
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(bandwidth_reduction_pct > 40, "Fusion should reduce bandwidth by more than 40%.")
check(fusion_speedup > 1, "Fused execution must be faster than unfused.")
kernels_unfused_str = fmt(kernels_unfused_value, precision=0, commas=False)
kernels_fused_str = fmt(kernels_fused_value, precision=0, commas=False)
saved_latency_ms_str = fmt(saved_latency_ms_value, precision=0, commas=False)
unfused_time_us_str = fmt(unfused_time_us_value, precision=0, commas=False)
fused_time_us_str = fmt(fused_time_us_value, precision=1, commas=False)
fusion_speedup_str = fmt(fusion_speedup_value, precision=2, commas=False)
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
conv_bn_relu_intermediate_mb_str = fmt(conv_bn_relu_intermediate_mb, precision=1, commas=False)
gemm_intermediate_mb_str = fmt(gemm_intermediate_mb, precision=1, commas=False)
feat_map_kb_str = fmt(feat_map_mb * 1000, precision=0, commas=False)
weights_mb_str = fmt(weights_mb, precision=1, commas=False)
bn_params_kb_str = fmt(bn_params_mb * KIB_TO_BYTES, precision=0, commas=False)
unfused_conv_mb_str = fmt(unfused_conv_mb, precision=1, commas=False)
unfused_bn_mb_str = fmt(unfused_bn_mb, precision=1, commas=False)
unfused_relu_mb_str = fmt(unfused_relu_mb, precision=1, commas=False)
total_unfused_mb_str = fmt(total_unfused_mb, precision=1, commas=False)
total_fused_mb_str = fmt(total_fused_mb, precision=1, commas=False)
bandwidth_reduction_pct_str = fmt(bandwidth_reduction_pct, precision=0, commas=False)
kernels_unfused_str = fmt(kernels_unfused, precision=0, commas=False)
kernels_fused_str = fmt(kernels_fused, precision=0, commas=False)
saved_latency_ms_str = fmt(saved_latency_ms, precision=0, commas=False)
unfused_time_us_str = fmt(unfused_time_us, precision=0, commas=False)
fused_time_us_str = fmt(fused_time_us, precision=1, commas=False)
fusion_speedup_str = fmt(fusion_speedup, precision=2, commas=False)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
conv_bn_relu_intermediate_mb_str = FusionCalc.conv_bn_relu_intermediate_mb_str
gemm_intermediate_mb_str = FusionCalc.gemm_intermediate_mb_str
feat_map_kb_str = FusionCalc.feat_map_kb_str
weights_mb_str = FusionCalc.weights_mb_str
bn_params_kb_str = FusionCalc.bn_params_kb_str
unfused_conv_mb_str = FusionCalc.unfused_conv_mb_str
unfused_bn_mb_str = FusionCalc.unfused_bn_mb_str
unfused_relu_mb_str = FusionCalc.unfused_relu_mb_str
total_unfused_mb_str = FusionCalc.total_unfused_mb_str
total_fused_mb_str = FusionCalc.total_fused_mb_str
bandwidth_reduction_pct_str = FusionCalc.bandwidth_reduction_pct_str
kernels_unfused_str = FusionCalc.kernels_unfused_str
kernels_fused_str = FusionCalc.kernels_fused_str
saved_latency_ms_str = FusionCalc.saved_latency_ms_str
unfused_time_us_str = FusionCalc.unfused_time_us_str
fused_time_us_str = FusionCalc.fused_time_us_str
fusion_speedup_str = FusionCalc.fusion_speedup_str
```
#### Operator Fusion {#sec-model-compression-operator-fusion-ac1d}
@@ -4594,16 +4687,28 @@ def conv_bn_relu_fused(input, weight, gamma, beta, mean, var):
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check, md_math
# --- Inputs (transfer counts) ---
unfused_transfers_value = 6 # read/write for each of conv, BN, ReLU
fused_transfers_value = 2 # read input, write output
class ConvFusionCalc:
"""Demonstrate 3x memory traffic reduction from Conv-BN-ReLU fusion (6 transfers → 2)."""
# --- Process ---
transfer_reduction_value = unfused_transfers_value / fused_transfers_value
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
unfused_transfers = 6 # read/write for Conv, BN, ReLU
fused_transfers = 2 # read input, write output
# --- Outputs (formatted strings for prose) ---
transfer_reduction_str = fmt(transfer_reduction_value, precision=0, commas=False)
conv_bn_relu_mem_md = md_math(f"2 \\times 256 \\times 28 \\times 28 \\times 4 \\text{{ bytes}} \\approx \\text{{{conv_bn_relu_intermediate_mb_str} MB}}")
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
transfer_reduction = unfused_transfers / fused_transfers
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(transfer_reduction == 3, "Conv-BN-ReLU fusion must yield exactly 3x transfer reduction.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
transfer_reduction_str = fmt(transfer_reduction, precision=0, commas=False)
conv_bn_relu_mem_md = md_math(
f"2 \\times 256 \\times 28 \\times 28 \\times 4 \\text{{ bytes}} \\approx \\text{{{conv_bn_relu_intermediate_mb_str} MB}}"
)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
transfer_reduction_str = ConvFusionCalc.transfer_reduction_str
conv_bn_relu_mem_md = ConvFusionCalc.conv_bn_relu_mem_md
```
The arithmetic operations remain identical, but memory traffic drops from 6 transfers to 2 transfers (`{python} transfer_reduction_str` $\times$ reduction). For a ResNet-50 layer with 256 channels and spatial size $28 \times 28$, this eliminates `{python} conv_bn_relu_mem_md` of intermediate memory traffic per layer.
@@ -6276,7 +6381,6 @@ Unlike software functions that compose predictably, optimization techniques inte
With the three optimization dimensions now fully explored, practitioners need systematic guidance for translating this knowledge into deployment decisions.
## Technique Selection {#sec-model-compression-technique-selection-ba16}
An engineer deploying a transformer model faces a concrete decision: the model exceeds the target device's memory by 3 $\times$, inference latency is 4 $\times$ above the SLO, and the power budget allows no more than 2 W sustained. Should she quantize first, prune first, distill to a smaller architecture, or combine techniques? The answer depends on which constraint is binding, what accuracy loss is tolerable, and how much engineering time is available. This section provides structured guidance for navigating that decision.
@@ -6314,7 +6418,6 @@ These choices also depend on the available engineering budget. When fine-tuning
This decision framework provides starting points for individual technique selection. Validating that a chosen technique actually achieves its intended goal requires systematic profiling and measurement, which @sec-model-compression-efficiency-measurement-2424 formalizes in detail. However, production deployments rarely rely on a single technique. Combining pruning with quantization, or distillation with hardware-aware design, introduces interaction effects that can either amplify benefits or create unexpected accuracy degradation. The following section addresses how to sequence and combine techniques effectively.
## Optimization Strategies {#sec-model-compression-optimization-strategies-f2f6}
The decision framework above guides individual technique selection, but the largest optimization gains emerge from combining multiple techniques. Because pruning, quantization, and architectural efficiency operate at different levels of the stack, they provide multiplicative benefits when sequenced appropriately.
@@ -6528,7 +6631,6 @@ This example illustrates why sequencing matters: pruning first concentrates impo
With dozens of techniques across three optimization dimensions, rigorous measurement is essential for validating that optimizations achieve their intended goals. A practitioner who prunes, quantizes, and fuses without profiling the actual impact on target hardware is optimizing blindly.
## Efficiency Measurement {#sec-model-compression-efficiency-measurement-2424}
A model quantized to INT8 should be 4 $\times$ smaller and roughly 3 $\times$ faster, but does it actually achieve those gains on the target hardware? Theoretical compression ratios and measured deployment improvements often diverge, sometimes dramatically, because real speedups depend on memory hierarchy effects, kernel implementations, and hardware utilization patterns that theory alone cannot predict. Translating theoretical compression ratios into measurable deployment improvements therefore requires systematic profiling and evaluation.
@@ -6566,7 +6668,6 @@ With these comprehensive baselines in place, the measurement framework must trac
Rigorous measurement tells practitioners *whether* their optimizations succeeded, but the measurements themselves require tooling to perform. Profiling, quantization, pruning, and deployment all depend on software frameworks that automate otherwise prohibitively complex workflows. We turn now to the implementation tools that make these techniques practical.
## Implementation Tools {#sec-model-compression-implementation-tools-4990}
Understanding optimization techniques is necessary but not sufficient; practical implementation relies on robust software support. Without framework tooling, quantization would require manual modification of model definitions and careful insertion of quantization operations throughout the network, while pruning would demand direct manipulation of weight tensors. Both become prohibitively complex as models scale.
@@ -6655,7 +6756,6 @@ Sparsity heat maps show sparsity distribution across layers (@fig-sparse-heat-ma
With the implementation tools and visualization capabilities established, the natural question is: how do these techniques compare when a practitioner must choose among them? Each optimization approach carries distinct trade-offs in accuracy, training cost, and hardware requirements, and a structured comparison clarifies which to reach for first.
## Technique Comparison {#sec-model-compression-technique-comparison-3142}
A comparative analysis across the three major approaches reveals how each addresses distinct aspects of the efficiency-accuracy trade-off. Pruning works best when sparse computation hardware is available and when reducing floating-point operations is critical. Quantization provides the most versatile approach with broad hardware support, making it ideal for diverse deployment scenarios. Knowledge distillation requires significant computational investment but produces consistently high-quality compressed models, making it the right choice when accuracy preservation is paramount. @tbl-optimization-comparison summarizes these trade-offs for systematic technique selection.
@@ -6673,7 +6773,6 @@ These techniques combine synergistically, with quantization often applied after
With the complete optimization toolkit now surveyed—from individual techniques through combination strategies—the most instructive lessons often come not from what works but from what fails. The following fallacies and pitfalls capture the most common mistakes engineers make when applying these techniques, each grounded in the quantitative trade-offs we have established throughout the chapter.
## Fallacies and Pitfalls {#sec-model-compression-fallacies-pitfalls-1b5e}
```{python}
@@ -6773,7 +6872,6 @@ Teams apply post-training quantization (PTQ) to avoid retraining and achieve 96.
Teams achieve `{python} int8_size_reduction_str` $\times$ model size reduction through INT8 quantization and expect `{python} int8_size_reduction_str` $\times$ memory savings in deployment. In practice, runtime overhead erodes compression gains. Dequantization kernels add `{python} dequant_overhead_str`% latency overhead converting INT8 weights back to FP16. Pruned models with irregular sparsity achieve only 12% latency reduction despite `{python} param_removal_str`% parameter removal because hardware cannot skip zeroed weights efficiently. As @sec-model-compression-profiling-opportunity-analysis-477f demonstrates, a BERT model pruned to 50% sparsity and quantized to INT8 achieves `{python} actual_speedup_str`% end-to-end speedup rather than the expected `{python} expected_speedup_str` $\times$, because unstructured sparsity creates irregular memory access. Production workflows must profile *deployed* latency on target hardware, not extrapolate from compression ratios.
## Summary {#sec-model-compression-summary-8229}
Model compression is not a bag of tricks but an engineering discipline built on three complementary dimensions: *structural optimization* determines what the model computes, *precision optimization* determines how precisely it computes, and *architectural optimization* determines how efficiently those computations execute on physical hardware. The most important lesson of this chapter is that these dimensions compose multiplicatively. Pruning alone might achieve 2 $\times$ compression; quantization alone might achieve 4 $\times$; but pruning, distillation, and quantization applied together can achieve 16 $\times$ — as BERT's compression from 440 MB to 28 MB demonstrates. The second lesson is equally important: theoretical compression ratios lie. A 4 $\times$ reduction in parameters translates to 4 $\times$ latency improvement only when the optimization aligns with the hardware's execution model. Unstructured sparsity on hardware that lacks sparse kernels achieves almost nothing; INT8 quantization on hardware without INT8 units achieves even less. Profile on target hardware, not paper metrics.

View File

@@ -21,6 +21,26 @@ When training throughput is low, check MFU, communication fraction, and goodput
```{python}
#| label: appendix-c3-setup
#| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ C³ TAXONOMY — MASTER COMPUTATION
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: PERSISTENT — All values used throughout the C³ Taxonomy appendix:
# │ @tbl-c3-dam-mapping, @tbl-c3-diagnostic-summary, @tbl-c3-traffic-light,
# │ @tbl-c3-bottleneck-actions, three case studies, scorecard, and exercises.
# │
# │ Goal: Provide all C³ diagnostic constants — case study parameters, effective
# │ FLOPS decomposition, and threshold strings — for the fleet-scale
# │ bottleneck classification reference appendix.
# │ Show: See individual section prose for formatted values. This cell provides
# │ the physics; string attributes are display-ready.
# │ How: calc_effective_flops() with MFU, scaling efficiency, and goodput ratio;
# │ all results as raw floats extracted via .m_as() or .magnitude where unitless.
# │
# │ Imports: mlsys.constants (H100_FLOPS_FP16_TENSOR, MFU_*, SCALING_EFF_*, OVERHEAD_*, …)
# │ mlsys.formulas (calc_effective_flops)
# │ mlsys.formatting (fmt, check, md_math)
# │ Exports: C3 = C3Taxonomy (accessed as C3.attribute in downstream cells)
# └─────────────────────────────────────────────────────────────────────────────
import math
from mlsys.constants import (
@@ -35,15 +55,6 @@ from mlsys.constants import (
from mlsys.formatting import fmt, check, md_math
from mlsys.formulas import calc_effective_flops
# =============================================================================
# PURPOSE
# =============================================================================
# Purpose: Compute all values for the C³ Taxonomy appendix.
# Used in: Case studies, effective FLOPS, scorecard, and inline prose.
#
# Philosophy: C³ parallels D·A·M — three MECE axes for fleet-scale diagnosis.
# Every computed value traces back to constants.py.
class C3Taxonomy:
"""Namespace for C³ diagnostic examples."""
@@ -71,7 +82,7 @@ class C3Taxonomy:
case3_oh_maintenance_pct = OVERHEAD_MAINTENANCE * 100
# Effective FLOPS calculation: 100K GPU cluster
h100_tflops = H100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude
h100_tflops = H100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second)
n_gpus_eff = 100_000
peak_pflops = n_gpus_eff * h100_tflops / 1000 # PFLOPs
goodput_all = 1.0 - (OVERHEAD_PIPELINE_BUBBLE +
@@ -80,7 +91,7 @@ class C3Taxonomy:
OVERHEAD_MAINTENANCE)
effective_pflops = calc_effective_flops(
peak_pflops, MFU_TRAINING_HIGH, SCALING_EFF_8192GPU, goodput_all
)
).magnitude # extract float; calc_effective_flops returns Quantity since formulas.py upgrade
c3_tax = peak_pflops / effective_pflops
eff_fraction = effective_pflops / peak_pflops
@@ -445,12 +456,8 @@ The gap between scaling-law predictions and observed training outcomes is, in la
```{python}
#| label: appendix-c3-effective-flops
#| echo: false
# =============================================================================
# PURPOSE
# =============================================================================
# Purpose: Format effective FLOPS values for the worked example.
# Used in: Effective FLOPS worked example prose.
# Goal: Alias C3Taxonomy strings for the 100K-GPU effective FLOPS callout prose.
# Exports: peak_str, eff_str, eff_pct_str, c3_tax_str, mfu_str, scaling_str, goodput_str
peak_str = C3.peak_pflops_str
eff_str = C3.effective_pflops_str

View File

@@ -15,6 +15,23 @@ This appendix collects the reference numbers and compact models for fleet-scale
```{python}
#| label: appendix-fleet-setup
#| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ FLEET FOUNDATIONS — MASTER COMPUTATION
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: PERSISTENT — All values used throughout the Fleet Foundations
# │ appendix: hardware reference table, MTBF tables, checkpoint sizing,
# │ effective FLOPS, comm-compute ratio, and all prose inline values.
# │
# │ Goal: Provide all quantitative fleet engineering constants in one place
# │ for the "Numbers Every Fleet Engineer Should Know" reference appendix.
# │ Show: See individual section cells for formatted values. This cell provides
# │ the physics; formatting cells convert to display strings.
# │ How: pint Quantities from mlsys.constants; fleet formulas from formulas.py;
# │ all results as typed Quantities or raw floats via .m_as().
# │
# │ Imports: mlsys.constants (*), mlsys.formulas (calc_*), mlsys.formatting (fmt, check)
# │ Exports: FF = FleetFoundations (accessed as FF.attribute in downstream cells)
# └─────────────────────────────────────────────────────────────────────────────
import math
from mlsys.constants import *
@@ -26,27 +43,13 @@ from mlsys.formulas import (
calc_young_daly_interval, calc_checkpoint_size
)
# =============================================================================
# PURPOSE
# =============================================================================
# Purpose: Compute all values for the Fleet Foundations appendix.
# Used in: Reference tables, worked examples, and inline prose throughout.
#
# Philosophy: Fleet-scale numbers emphasize RATIOS between tiers and
# SCALING BEHAVIOR with cluster size. Absolute values are
# current-generation snapshots; ratios persist across generations.
# =============================================================================
# NETWORK HIERARCHY
# =============================================================================
class FleetFoundations:
"""Namespace for fleet-scale reference calculations."""
# ── Communication Numbers ────────────────────────────────────────────────
# Bandwidth hierarchy (GB/s)
nvlink_h100_bw = int(NVLINK_H100_BW.to(GB / second).magnitude)
pcie5_bw = int(PCIE_GEN5_BW.to(GB / second).magnitude)
nvlink_h100_bw = int(NVLINK_H100_BW.m_as(GB / second))
pcie5_bw = int(PCIE_GEN5_BW.m_as(GB / second))
ib_ndr_bw = INFINIBAND_NDR_BW_GBS
ib_hdr_bw = INFINIBAND_HDR_BW_GBS
ib_xdr_bw = INFINIBAND_XDR_BW_GBS
@@ -95,28 +98,29 @@ class FleetFoundations:
mtbf_100k_h = calc_mtbf_cluster(GPU_MTTF_HOURS, cl_mega)
# Convert to minutes for readability
mtbf_256_min = mtbf_256_h * 60
mtbf_2048_min = mtbf_2048_h * 60
mtbf_8192_min = mtbf_8192_h * 60
mtbf_100k_min = mtbf_100k_h * 60
mtbf_256_min = mtbf_256_h.m_as(ureg.minute)
mtbf_2048_min = mtbf_2048_h.m_as(ureg.minute)
mtbf_8192_min = mtbf_8192_h.m_as(ureg.minute)
mtbf_100k_min = mtbf_100k_h.m_as(ureg.minute)
# Failure probability for a 24-hour job (using hours consistently)
pfail_256_24h = calc_failure_probability(mtbf_256_h, 24)
pfail_2048_24h = calc_failure_probability(mtbf_2048_h, 24)
pfail_8192_24h = calc_failure_probability(mtbf_8192_h, 24)
pfail_100k_24h = calc_failure_probability(mtbf_100k_h, 24)
# Failure probability for a 24-hour job
_24h = 24 * ureg.hour
pfail_256_24h = calc_failure_probability(mtbf_256_h, _24h)
pfail_2048_24h = calc_failure_probability(mtbf_2048_h, _24h)
pfail_8192_24h = calc_failure_probability(mtbf_8192_h, _24h)
pfail_100k_24h = calc_failure_probability(mtbf_100k_h, _24h)
# Checkpoint sizes (bytes)
ckpt_7b = calc_checkpoint_size(7e9)
# Checkpoint sizes
ckpt_7b = calc_checkpoint_size(7e9) # Quantity[byte]
ckpt_70b = calc_checkpoint_size(70e9)
ckpt_175b = calc_checkpoint_size(175e9)
ckpt_1t = calc_checkpoint_size(1e12)
# Convert to GB
ckpt_7b_gb = ckpt_7b / 1e9
ckpt_70b_gb = ckpt_70b / 1e9
ckpt_175b_gb = ckpt_175b / 1e9
ckpt_1t_tb = ckpt_1t / 1e12
# Extract in GB/TB
ckpt_7b_gb = ckpt_7b.m_as(GB)
ckpt_70b_gb = ckpt_70b.m_as(GB)
ckpt_175b_gb = ckpt_175b.m_as(GB)
ckpt_1t_tb = ckpt_1t.m_as(TB)
# Overhead budgets
oh_pipeline = int(OVERHEAD_PIPELINE_BUBBLE * 100)
@@ -125,20 +129,20 @@ class FleetFoundations:
oh_maintenance = int(OVERHEAD_MAINTENANCE * 100)
# ── Hardware Reference ───────────────────────────────────────────────────
h100_flops = int(H100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude)
h100_bw_tbs = f"{H100_MEM_BW.to(TB / second).magnitude:.2f}"
h100_cap = int(H100_MEM_CAPACITY.to(GiB).magnitude)
h100_tdp = int(H100_TDP.magnitude)
h100_flops = int(H100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second))
h100_bw_tbs = f"{H100_MEM_BW.m_as(TB / second):.2f}"
h100_cap = int(H100_MEM_CAPACITY.m_as(GiB))
h100_tdp = int(H100_TDP.m_as(watt))
b200_flops = int(B200_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude)
b200_bw_tbs = f"{B200_MEM_BW.to(TB / second).magnitude:.0f}"
b200_cap = int(B200_MEM_CAPACITY.to(GiB).magnitude)
b200_tdp = int(B200_TDP.magnitude)
b200_flops = int(B200_FLOPS_FP16_TENSOR.m_as(TFLOPs / second))
b200_bw_tbs = f"{B200_MEM_BW.m_as(TB / second):.0f}"
b200_cap = int(B200_MEM_CAPACITY.m_as(GiB))
b200_tdp = int(B200_TDP.m_as(watt))
tpuv5_flops = int(TPUV5P_FLOPS_BF16.to(TFLOPs / second).magnitude)
tpuv5_bw_tbs = f"{TPUV5P_MEM_BW.to(TB / second).magnitude:.2f}"
tpuv5_cap = int(TPUV5P_MEM_CAPACITY.to(GiB).magnitude)
tpuv5_ici = int(TPUV5P_ICI_BW.to(GB / second).magnitude)
tpuv5_flops = int(TPUV5P_FLOPS_BF16.m_as(TFLOPs / second))
tpuv5_bw_tbs = f"{TPUV5P_MEM_BW.m_as(TB / second):.2f}"
tpuv5_cap = int(TPUV5P_MEM_CAPACITY.m_as(GiB))
tpuv5_ici = int(TPUV5P_ICI_BW.m_as(GB / second))
# ── Power and Sustainability ─────────────────────────────────────────────
rack_trad = RACK_POWER_TRADITIONAL_KW
@@ -154,17 +158,19 @@ class FleetFoundations:
# ── Effective FLOPS Example ──────────────────────────────────────────────
# 1024-GPU cluster, H100, realistic overheads
peak_1024 = 1024 * H100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude
_peak_1024_qty = 1024 * H100_FLOPS_FP16_TENSOR # Quantity[TFLOPs/s]
peak_1024 = _peak_1024_qty.m_as(TFLOPs / second) # raw float for display
goodput_ratio = 1.0 - (OVERHEAD_PIPELINE_BUBBLE +
OVERHEAD_CHECKPOINT +
OVERHEAD_FAILURE_RECOVERY +
OVERHEAD_MAINTENANCE)
eff_flops_1024 = calc_effective_flops(
peak_1024,
_eff_flops_1024_qty = calc_effective_flops(
_peak_1024_qty,
MFU_TRAINING_HIGH,
SCALING_EFF_1024GPU,
goodput_ratio
)
) # Quantity[flop/second]
eff_flops_1024 = _eff_flops_1024_qty.m_as(TFLOPs / second) # raw float for display
eff_fraction = eff_flops_1024 / peak_1024
# ── Invariant Checks ─────────────────────────────────────────────────────
@@ -289,12 +295,8 @@ Communication defines the boundaries of parallelism. These tables quantify the b
```{python}
#| label: fleet-comm-numbers
#| echo: false
# =============================================================================
# PURPOSE
# =============================================================================
# Purpose: Compute communication hierarchy values for inline references.
# Used in: Communication numbers tables and prose.
# Goal: Format communication bandwidth and latency strings for @tbl-fleet-bandwidth-hierarchy and @tbl-fleet-latency-hierarchy.
# Exports: nvlink_bw_str, pcie5_bw_str, ib_*_str, tpuv5_ici_str, nvlink_to_ib_str, *_lat_str
# ── Bandwidth ratios ────────────────────────────────────────────────────────
nvlink_bw_str = fmt(FF.nvlink_h100_bw, precision=0)
@@ -386,12 +388,8 @@ At fleet scale, coordination---failure recovery, checkpointing, and maintenance-
```{python}
#| label: fleet-mtbf-table
#| echo: false
# =============================================================================
# PURPOSE
# =============================================================================
# Purpose: Format MTBF and failure probability values for the table.
# Used in: MTBF by cluster size table.
# Goal: Format MTBF hours, minutes, and P(failure) percentages for @tbl-fleet-mtbf.
# Exports: mtbf_256_str, mtbf_2048_str, mtbf_8192_str, mtbf_100k_str, mtbf_*_min_str, pfail_*_str
mtbf_256_str = fmt(FF.mtbf_256_h, precision=1, commas=False)
mtbf_2048_str = fmt(FF.mtbf_2048_h, precision=1, commas=False)
@@ -432,12 +430,8 @@ Checkpointing is the primary recovery mechanism, and its cost depends on the mod
```{python}
#| label: fleet-checkpoint-sizes
#| echo: false
# =============================================================================
# PURPOSE
# =============================================================================
# Purpose: Format checkpoint sizes for the reference table.
# Used in: Checkpoint size table.
# Goal: Format checkpoint sizes in GB/TB for @tbl-fleet-checkpoint-sizes.
# Exports: ckpt_7b_str, ckpt_70b_str, ckpt_175b_str, ckpt_1t_str
ckpt_7b_str = fmt(FF.ckpt_7b_gb, precision=0)
ckpt_70b_str = fmt(FF.ckpt_70b_gb, precision=0)
@@ -484,12 +478,8 @@ These numbers reflect the current generation of fleet-scale hardware. Use them f
```{python}
#| label: fleet-hardware-ref
#| echo: false
# =============================================================================
# PURPOSE
# =============================================================================
# Purpose: Format hardware reference values for the comparison table.
# Used in: Current hardware reference table.
# Goal: Format H100, B200, and TPU v5p specs for @tbl-fleet-hardware-ref.
# Exports: h100_flops_str, h100_bw_str, h100_cap_str, h100_tdp_str, b200_*, tpuv5_*
h100_flops_str = fmt(FF.h100_flops, precision=0)
h100_bw_str = FF.h100_bw_tbs
@@ -547,36 +537,52 @@ Volume I introduced Amdahl's Law for a single machine, where the serial fraction
```{python}
#| label: fleet-amdahl-example
#| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ FLEET AMDAHL EXAMPLE
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: @sec-fleet-foundations-amdahls-fleet worked example
# │
# │ Goal: Compute Amdahl's Law speedup at 32/256/1024/8192 GPUs for 10% serial fraction.
# │ Show: Speedup values and the Amdahl ceiling for inline prose.
# │ How: calc_amdahls_speedup() from formulas.py; check() for invariants.
# │
# │ Imports: mlsys.formulas (calc_amdahls_speedup), mlsys.formatting (fmt, check)
# │ Exports: s_fleet_pct_str, max_speedup_str, su_32_str, su_256_str, su_1024_str, su_8192_str
# └─────────────────────────────────────────────────────────────────────────────
# =============================================================================
# PURPOSE
# =============================================================================
# Purpose: Compute Amdahl's Law examples at fleet scale.
# Used in: Amdahl's Law at Fleet Scale worked example.
class FleetAmdahlExample:
"""Amdahl's Law speedup at 32/256/1024/8192 GPUs for 10% serial fraction."""
# ── PARAMETERS ──────────────────────────────────────────────────────────────
s_fleet = 0.10 # 10% serial fraction (communication + sync)
n_values = [32, 256, 1024, 8192]
# ── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
s_fleet = 0.10
n_values = [32, 256, 1024, 8192]
# ── CALCULATION ─────────────────────────────────────────────────────────────
speedups = {}
for n in n_values:
su = calc_amdahls_speedup(1 - s_fleet, n)
speedups[n] = su
# ── 2. CALCULATION (The Physics) ────────────────────────────────────────
speedups = {}
for _n in n_values:
speedups[_n] = calc_amdahls_speedup(1 - s_fleet, _n)
max_speedup = 1 / s_fleet
max_speedup = 1 / s_fleet
# ── INVARIANTS ──────────────────────────────────────────────────────────────
check(speedups[8192] < max_speedup, "Speedup must be below Amdahl limit")
check(max_speedup == 10.0, "Max speedup for 10% serial fraction must be 10x")
# ── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(speedups[8192] < max_speedup, "Speedup must be below Amdahl limit")
check(max_speedup == 10.0, "Max speedup for 10% serial fraction must be 10x")
# ── OUTPUTS ─────────────────────────────────────────────────────────────────
s_fleet_pct_str = "10"
max_speedup_str = fmt(max_speedup, precision=0, commas=False)
su_32_str = fmt(speedups[32], precision=1, commas=False)
su_256_str = fmt(speedups[256], precision=1, commas=False)
su_1024_str = fmt(speedups[1024], precision=1, commas=False)
su_8192_str = fmt(speedups[8192], precision=1, commas=False)
# ── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
s_fleet_pct_str = "10"
max_speedup_str = fmt(max_speedup, precision=0, commas=False)
su_32_str = fmt(speedups[32], precision=1, commas=False)
su_256_str = fmt(speedups[256], precision=1, commas=False)
su_1024_str = fmt(speedups[1024], precision=1, commas=False)
su_8192_str = fmt(speedups[8192], precision=1, commas=False)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
s_fleet_pct_str = FleetAmdahlExample.s_fleet_pct_str
max_speedup_str = FleetAmdahlExample.max_speedup_str
su_32_str = FleetAmdahlExample.su_32_str
su_256_str = FleetAmdahlExample.su_256_str
su_1024_str = FleetAmdahlExample.su_1024_str
su_8192_str = FleetAmdahlExample.su_8192_str
```
To see the fleet-scale implications, consider a training workload where `{python} s_fleet_pct_str`% of wall-clock time is spent in synchronization, communication, and other serial overhead. Amdahl's Law gives the following speedups:
@@ -604,58 +610,72 @@ When $\rho < 1$, computation dominates and communication can be overlapped. When
```{python}
#| label: fleet-comm-comp-ratio
#| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ FLEET COMM-COMPUTE RATIO
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: @sec-fleet-foundations-comm-compute-ratio worked example (@tbl-fleet-comm-comp)
# │
# │ Goal: Compute ρ = T_comm / T_comp for 3 scenarios: 7B DP, 350M DP, tensor-parallel.
# │ Show: AllReduce times in ms and ρ ratios for each scenario; ~0.1 for DP 7B, ~3 for DP 350M.
# │ How: calc_ring_allreduce_time() with IB NDR params; NVLink BW for tensor-parallel.
# │
# │ Imports: mlsys.constants (INFINIBAND_NDR_BW_GBS, IB_NDR_LATENCY_US, NVLINK_H100_BW, GB, second)
# │ Exports: ar_7b_ms_str, rho_7b_str, ar_350m_ms_str, rho_350m_str, rho_tp_str
# └─────────────────────────────────────────────────────────────────────────────
# =============================================================================
# PURPOSE
# =============================================================================
# Purpose: Compute communication-computation ratios for different scenarios.
# Used in: Communication-computation ratio worked example.
class FleetCommCompRatio:
"""Communication-to-computation ratio ρ for three parallelism scenarios."""
# ── SCENARIO 1: Data parallelism, large model ──────────────────────────────
# 7B model, 256 GPUs, IB NDR
grad_bytes_7b = 7e9 * 2 # 7B params * 2 bytes (BF16 gradients)
allreduce_time_7b = calc_ring_allreduce_time(
message_bytes=grad_bytes_7b,
n_gpus=256,
bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9,
latency_s=IB_NDR_LATENCY_US * 1e-6
)
# ── SCENARIO 1: Data parallelism, large model ──────────────────────────
# 7B model, 256 GPUs, IB NDR
grad_bytes_7b = 7e9 * 2 # 7B params * 2 bytes (BF16 gradients)
allreduce_time_7b = calc_ring_allreduce_time(
message_bytes=grad_bytes_7b,
n_gpus=256,
bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9,
latency_s=IB_NDR_LATENCY_US * 1e-6
) # Quantity[second]
# Computation time: assume ~50ms forward+backward per step
comp_time_7b = 0.050 # 50 ms
rho_7b = allreduce_time_7b / comp_time_7b
comp_time_7b = 0.050 # 50 ms (seconds)
rho_7b = allreduce_time_7b.m_as(ureg.second) / comp_time_7b
# ── SCENARIO 2: Data parallelism, small model ──────────────────────────────
# 350M model, 256 GPUs, IB NDR
grad_bytes_350m = 350e6 * 2 # 350M params * 2 bytes
allreduce_time_350m = calc_ring_allreduce_time(
message_bytes=grad_bytes_350m,
n_gpus=256,
bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9,
latency_s=IB_NDR_LATENCY_US * 1e-6
)
comp_time_350m = 0.005 # 5 ms (smaller model)
rho_350m = allreduce_time_350m / comp_time_350m
# ── SCENARIO 2: Data parallelism, small model ──────────────────────────
# 350M model, 256 GPUs, IB NDR
grad_bytes_350m = 350e6 * 2 # 350M params * 2 bytes
allreduce_time_350m = calc_ring_allreduce_time(
message_bytes=grad_bytes_350m,
n_gpus=256,
bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9,
latency_s=IB_NDR_LATENCY_US * 1e-6
) # Quantity[second]
comp_time_350m = 0.005 # 5 ms (seconds, smaller model)
rho_350m = allreduce_time_350m.m_as(ureg.second) / comp_time_350m
# ── SCENARIO 3: Tensor parallelism, within node ────────────────────────────
# Activation transfer: 8 GPUs, NVLink, ~16 MB per layer
act_bytes = 16e6 # 16 MB
act_transfer_time = act_bytes / (NVLINK_H100_BW.to(GB / second).magnitude * 1e9)
comp_time_layer = 0.001 # 1 ms per layer
rho_tp = act_transfer_time / comp_time_layer
# ── SCENARIO 3: Tensor parallelism, within node ────────────────────────
# Activation transfer: 8 GPUs, NVLink, ~16 MB per layer
act_bytes = 16e6 # 16 MB
act_transfer_time = act_bytes / (NVLINK_H100_BW.m_as(GB / second) * 1e9)
comp_time_layer = 0.001 # 1 ms per layer
rho_tp = act_transfer_time / comp_time_layer
# ── INVARIANTS ──────────────────────────────────────────────────────────────
check(rho_7b > 0.1, "7B comm ratio must be non-trivial")
check(rho_350m > 0.01, "350M comm ratio must be non-trivial")
# ── INVARIANTS ──────────────────────────────────────────────────────────
check(rho_7b > 0.1, "7B comm ratio must be non-trivial")
check(rho_350m > 0.01, "350M comm ratio must be non-trivial")
# ── OUTPUTS ─────────────────────────────────────────────────────────────────
ar_7b_ms_str = fmt(allreduce_time_7b * 1000, precision=1, commas=False)
rho_7b_str = fmt(rho_7b, precision=2, commas=False)
# ── OUTPUTS ─────────────────────────────────────────────────────────────
ar_7b_ms_str = fmt(allreduce_time_7b.m_as(ureg.millisecond), precision=1, commas=False)
rho_7b_str = fmt(rho_7b, precision=2, commas=False)
ar_350m_ms_str = fmt(allreduce_time_350m.m_as(ureg.millisecond), precision=1, commas=False)
rho_350m_str = fmt(rho_350m, precision=1, commas=False)
rho_tp_str = fmt(rho_tp, precision=3, commas=False)
ar_350m_ms_str = fmt(allreduce_time_350m * 1000, precision=1, commas=False)
rho_350m_str = fmt(rho_350m, precision=1, commas=False)
rho_tp_str = fmt(rho_tp, precision=3, commas=False)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
ar_7b_ms_str = FleetCommCompRatio.ar_7b_ms_str
rho_7b_str = FleetCommCompRatio.rho_7b_str
rho_7b = FleetCommCompRatio.rho_7b # raw float used in fmt() call in prose
ar_350m_ms_str = FleetCommCompRatio.ar_350m_ms_str
rho_350m_str = FleetCommCompRatio.rho_350m_str
rho_tp_str = FleetCommCompRatio.rho_tp_str
```
@tbl-fleet-comm-comp shows the ratio for three representative scenarios. The contrast between them reveals why parallelism strategy must match the workload.
@@ -685,12 +705,8 @@ The key insight for fleet-scale ML is that weak scaling is not just a mathematic
```{python}
#| label: fleet-effective-flops
#| echo: false
# =============================================================================
# PURPOSE
# =============================================================================
# Purpose: Compute effective FLOPS for the compound loss example.
# Used in: Effective FLOPS worked example.
# Goal: Format peak and effective FLOPS for the 1,024-GPU compound loss callout.
# Exports: peak_str, eff_str, eff_pct_str, goodput_pct_str, mfu_pct_str, scaling_pct_str
peak_str = fmt(FF.peak_1024, precision=0)
eff_str = fmt(FF.eff_flops_1024, precision=0)

View File

@@ -35,6 +35,28 @@ This appendix is designed as a *reference*. Use it when you need to move from in
```{python}
#| label: appendix-reliability-setup
#| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ RELIABILITY FOUNDATIONS — MASTER COMPUTATION
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: PERSISTENT — All values used throughout the Reliability Foundations
# │ appendix: @tbl-component-fit, @tbl-mtbf-cluster, @tbl-failure-prob,
# │ @tbl-checkpoint-size, @tbl-recovery-anatomy, @tbl-strategy-comparison,
# │ @tbl-availability-stacking, and all Young-Daly worked examples.
# │
# │ Goal: Provide all reliability constants — FIT rates, MTBF cascade, Young-Daly
# │ optimal checkpoint interval, recovery anatomy, and availability stacking —
# │ for the "Failure as a Physical Constraint" reference appendix.
# │ Show: See individual section cells for formatted values. This cell provides
# │ the physics; formatting cells and f-strings convert to display strings.
# │ How: pint Quantities from mlsys.constants; calc_mtbf_node, calc_mtbf_cluster,
# │ calc_young_daly_interval, calc_failure_probability, calc_checkpoint_size,
# │ calc_availability_stacked from formulas.py; all extractions via .m_as().
# │
# │ Imports: mlsys.constants (*), mlsys.formulas (calc_mtbf_*, calc_young_daly_interval,
# │ calc_failure_probability, calc_checkpoint_size, calc_availability_stacked)
# │ mlsys.formatting (fmt, check)
# │ Exports: R = ReliabilityFoundations (accessed as R.attribute in downstream cells)
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import *
from mlsys.formatting import fmt, check
@@ -103,8 +125,9 @@ class ReliabilityFoundations:
@classmethod
def p_failure(cls, n_gpus, duration_hours):
mtbf_h = cls.cluster_mtbf(n_gpus)
return calc_failure_probability(mtbf_h, duration_hours)
mtbf_h = cls.cluster_mtbf(n_gpus) # Quantity[hour]
dur_h = duration_hours * ureg.hour # attach unit
return calc_failure_probability(mtbf_h, dur_h)
# ┌── 5. CHECKPOINT SIZING ────────────────────────────────────────
# Mixed-precision Adam: 16 bytes/param
@@ -114,25 +137,28 @@ class ReliabilityFoundations:
@classmethod
def ckpt_size_gb(cls, n_params):
return calc_checkpoint_size(n_params, cls.bytes_per_param) / 1e9
return calc_checkpoint_size(n_params, cls.bytes_per_param).m_as(GB)
# ┌── 6. YOUNG-DALY (10K cluster, 175B model) ────────────────────
ckpt_175b_bytes = calc_checkpoint_size(175e9, 16)
ckpt_175b_gb = ckpt_175b_bytes / 1e9
ckpt_write_bw = CHECKPOINT_WRITE_BW_GBS # GB/s
ckpt_write_time_s = ckpt_175b_gb / ckpt_write_bw
ckpt_175b_bytes = calc_checkpoint_size(175e9, 16) # Quantity[byte]
ckpt_175b_gb = ckpt_175b_bytes.m_as(GB) # raw float in GB
ckpt_write_bw = CHECKPOINT_WRITE_BW_GBS # GB/s (raw float)
ckpt_write_time_s = ckpt_175b_gb / ckpt_write_bw # raw float (seconds)
cluster_mtbf_10k_s = cluster_mtbf_10k * SEC_PER_HOUR
tau_opt_s = calc_young_daly_interval(ckpt_write_time_s, cluster_mtbf_10k_s)
tau_opt_min = tau_opt_s / SECONDS_PER_MINUTE
cluster_mtbf_10k_s = cluster_mtbf_10k.m_as(ureg.second) # raw float (seconds)
tau_opt_s = calc_young_daly_interval(ckpt_write_time_s, cluster_mtbf_10k_s) # Quantity[second]
tau_opt_min = tau_opt_s.m_as(ureg.minute) # raw float in minutes
# ┌── 7. RECOVERY TIME ───────────────────────────────────────────
t_detect = HEARTBEAT_TIMEOUT_S
t_reschedule = RESCHEDULE_TIME_S
t_reload_s = ckpt_write_time_s # same BW, same size
t_detect = HEARTBEAT_TIMEOUT_S # raw float (seconds) — kept for table display
t_reschedule = RESCHEDULE_TIME_S # raw float (seconds) — kept for table display
t_reload_s = ckpt_write_time_s # raw float (seconds)
# Replay: half the interval on average
t_replay_s = tau_opt_s / 2
t_recovery_total_s = t_detect + t_reschedule + t_reload_s + t_replay_s
t_replay_s = tau_opt_s / 2 # Quantity[second]
# Sum: attach units to raw seconds, then extract in minutes
t_recovery_total_s = (
(t_detect + t_reschedule + t_reload_s) * ureg.second + t_replay_s
).m_as(ureg.minute) # raw float in minutes
# ┌── 8. GOODPUT ─────────────────────────────────────────────────
overhead_ckpt = OVERHEAD_CHECKPOINT
@@ -150,8 +176,8 @@ class ReliabilityFoundations:
R = ReliabilityFoundations # short alias for inline use
# ┌── INVARIANTS ──────────────────────────────────────────────────────
check(R.cluster_mtbf_10k < 5.0,
f"10K cluster MTBF should be < 5 hours, got {R.cluster_mtbf_10k:.2f}")
check(R.cluster_mtbf_10k.m_as(ureg.hour) < 5.0,
f"10K cluster MTBF should be < 5 hours, got {R.cluster_mtbf_10k.m_as(ureg.hour):.2f}")
check(R.tau_opt_min > 5 and R.tau_opt_min < 60,
f"Young-Daly interval should be 5-60 min, got {R.tau_opt_min:.1f}")
check(R.p_failure(10_000, 24) > 0.99,
@@ -159,12 +185,12 @@ check(R.p_failure(10_000, 24) > 0.99,
# ┌── FORMATTED OUTPUTS ──────────────────────────────────────────────
gpu_mttf_str = fmt(R.gpu_mttf, precision=0)
node_mtbf_str = fmt(R.node_mtbf, precision=0)
cluster_mtbf_10k_str = fmt(R.cluster_mtbf_10k, precision=2)
node_mtbf_str = fmt(R.node_mtbf.m_as(ureg.hour), precision=0)
cluster_mtbf_10k_str = fmt(R.cluster_mtbf_10k.m_as(ureg.hour), precision=2)
tau_opt_min_str = fmt(R.tau_opt_min, precision=1)
ckpt_175b_gb_str = fmt(R.ckpt_175b_gb, precision=0)
ckpt_write_time_str = fmt(R.ckpt_write_time_s, precision=1)
t_recovery_str = fmt(R.t_recovery_total_s / SECONDS_PER_MINUTE, precision=1)
t_recovery_str = fmt(R.t_recovery_total_s, precision=1)
```
## Failure Probability at Scale {#sec-reliability-foundations-failure-probability}
@@ -188,8 +214,8 @@ $$ \text{MTTF} = \frac{10^9}{\text{FIT}} $$ {#eq-mttf-from-fit}
```{python}
#| label: component-fit-table
#| echo: false
# Format component data for the table
# Goal: Format per-component MTTF in years for @tbl-component-fit.
# Exports: gpu_mttf_yr, hbm_mttf_yr, nic_mttf_yr, psu_mttf_yr, pcie_mttf_yr, cable_mttf_yr, tor_mttf_yr
gpu_mttf_yr = f"{R.gpu_mttf / HOURS_PER_YEAR:.1f}"
hbm_mttf_yr = f"{R.hbm_mttf / HOURS_PER_YEAR:.1f}"
nic_mttf_yr = f"{R.nic_mttf / HOURS_PER_YEAR:.1f}"
@@ -233,24 +259,24 @@ For a cluster of $N$ identical nodes, the same logic applies one level up:
$$ \text{MTBF}_\text{cluster} = \frac{\text{MTBF}_\text{node}}{N} $$ {#eq-mtbf-cluster}
This is the **MTBF cascade**: reliability degrades linearly with component count at each level, and the levels compound. A node with `{python} f"{R.node_mtbf:,.0f}"`-hour MTBF sounds reliable. A cluster of `{python} f"{R.nodes_10k:,}"` such nodes has an MTBF of just `{python} cluster_mtbf_10k_str` hours---a failure every few hours is the expected steady state.
This is the **MTBF cascade**: reliability degrades linearly with component count at each level, and the levels compound. A node with `{python} f"{R.node_mtbf.m_as(ureg.hour):,.0f}"`-hour MTBF sounds reliable. A cluster of `{python} f"{R.nodes_10k:,}"` such nodes has an MTBF of just `{python} cluster_mtbf_10k_str` hours---a failure every few hours is the expected steady state.
@tbl-mtbf-cluster shows how cluster MTBF shrinks as fleet size grows.
```{python}
#| label: mtbf-cluster-table
#| echo: false
# Build MTBF table data
# Goal: Build MTBF row data (hours or minutes, failures/day) for @tbl-mtbf-cluster.
# Exports: mtbf_data list of dicts with "gpus", "nodes", "mtbf", "per_day" keys
mtbf_data = []
for n_gpus in R.cluster_sizes:
n_nodes = R.nodes_for_gpus(n_gpus)
mtbf_h = R.cluster_mtbf(n_gpus)
if mtbf_h >= 1.0:
mtbf_str = f"{mtbf_h:.1f} hours"
mtbf_h_val = R.cluster_mtbf(n_gpus).m_as(ureg.hour) # raw float in hours
if mtbf_h_val >= 1.0:
mtbf_str = f"{mtbf_h_val:.1f} hours"
else:
mtbf_str = f"{mtbf_h * SECONDS_PER_MINUTE:.0f} minutes"
per_day = 24 / mtbf_h
mtbf_str = f"{mtbf_h_val * 60:.0f} minutes"
per_day = 24 / mtbf_h_val
mtbf_data.append({
"gpus": f"{n_gpus:,}",
"nodes": f"{n_nodes:,}",
@@ -292,8 +318,8 @@ When $T_\text{job} \gg \text{MTBF}$, this probability approaches 1 rapidly. @tbl
```{python}
#| label: failure-probability-table
#| echo: false
# Build failure probability matrix
# Goal: Compute P(≥1 failure) matrix for @tbl-failure-prob across cluster sizes and job durations.
# Exports: fp_data dict keyed by n_gpus; values are [1-day, 1-week, 30-day] probability strings
dur_labels = ["1 Day", "1 Week", "30 Days"]
fp_data = {}
for n_gpus in R.cluster_sizes:
@@ -370,6 +396,8 @@ $$ \text{Checkpoint Size} = N_\text{params} \times 16 \text{ bytes/param} $$ {#e
```{python}
#| label: checkpoint-sizing-table
#| echo: false
# Goal: Format checkpoint sizes and write times for @tbl-checkpoint-size across 7B1T models.
# Exports: ckpt_data list of dicts with "label", "ckpt_gb", "write_time" keys
ckpt_data = []
for i, n_params in enumerate(R.model_sizes_params):
@@ -407,28 +435,50 @@ At frontier scale (175B+ parameters), checkpoint sizes reach the terabyte range.
```{python}
#| label: worked-example-young-daly
#| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ YOUNG-DALY WORKED EXAMPLE
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: @sec-reliability-foundations-worked-example callout
# │
# │ Goal: Compute optimal checkpoint interval τ_opt for 175B model on 10K-GPU cluster;
# │ show scaling to 20K GPUs.
# │ Show: ~28 min optimal interval, ~X% checkpoint overhead, shorter interval at 20K GPUs.
# │ How: calc_young_daly_interval(δ, MTBF_s) from R.ckpt_write_time_s and R.cluster_mtbf_10k_s.
# │
# │ Imports: mlsys.formulas (calc_young_daly_interval), mlsys.constants (GPUS_PER_HOST)
# │ Exports: yd_mtbf_h_str, yd_delta_str, yd_tau_min_str, yd_overhead_str, tau_20k_min_str
# └─────────────────────────────────────────────────────────────────────────────
# All values already computed in ReliabilityFoundations
yd_mtbf_h = R.cluster_mtbf_10k
yd_mtbf_s = R.cluster_mtbf_10k_s
yd_delta = R.ckpt_write_time_s
yd_tau_s = R.tau_opt_s
yd_tau_min = R.tau_opt_min
class WorkedExampleYoungDaly:
"""Young-Daly optimal checkpoint interval for 175B model on 10K-GPU cluster."""
# All values already computed in ReliabilityFoundations
yd_mtbf_h = R.cluster_mtbf_10k # Quantity[hour]
yd_mtbf_s = R.cluster_mtbf_10k_s # raw float (seconds)
yd_delta = R.ckpt_write_time_s # raw float (seconds)
yd_tau_s = R.tau_opt_s # Quantity[second]
yd_tau_min = R.tau_opt_min # raw float in minutes
# Overhead from checkpointing alone
yd_ckpt_overhead = (yd_delta / yd_tau_s) * 100
# Overhead from checkpointing alone
yd_ckpt_overhead = (yd_delta / yd_tau_s.m_as(ureg.second)) * 100
# What if MTBF halves (20K GPUs)?
mtbf_20k_h = R.node_mtbf / (20_000 // GPUS_PER_HOST)
mtbf_20k_s = mtbf_20k_h * SEC_PER_HOUR
tau_20k_s = calc_young_daly_interval(yd_delta, mtbf_20k_s)
tau_20k_min = tau_20k_s / SECONDS_PER_MINUTE
# What if MTBF halves (20K GPUs)?
mtbf_20k_h = R.node_mtbf / (20_000 // GPUS_PER_HOST) # Quantity[hour]
mtbf_20k_s = mtbf_20k_h.m_as(ureg.second) # raw float (seconds)
tau_20k_s = calc_young_daly_interval(yd_delta, mtbf_20k_s) # Quantity[second]
tau_20k_min = tau_20k_s.m_as(ureg.minute) # raw float in minutes
yd_mtbf_h_str = fmt(yd_mtbf_h, precision=2)
yd_delta_str = fmt(yd_delta, precision=1)
yd_tau_min_str = fmt(yd_tau_min, precision=1)
yd_overhead_str = fmt(yd_ckpt_overhead, precision=1)
tau_20k_min_str = fmt(tau_20k_min, precision=1)
yd_mtbf_h_str = fmt(yd_mtbf_h.m_as(ureg.hour), precision=2)
yd_delta_str = fmt(yd_delta, precision=1)
yd_tau_min_str = fmt(yd_tau_min, precision=1)
yd_overhead_str = fmt(yd_ckpt_overhead, precision=1)
tau_20k_min_str = fmt(tau_20k_min, precision=1)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
yd_mtbf_h_str = WorkedExampleYoungDaly.yd_mtbf_h_str
yd_delta_str = WorkedExampleYoungDaly.yd_delta_str
yd_tau_min_str = WorkedExampleYoungDaly.yd_tau_min_str
yd_overhead_str = WorkedExampleYoungDaly.yd_overhead_str
tau_20k_min_str = WorkedExampleYoungDaly.tau_20k_min_str
```
::: {.callout-example title="Young-Daly: 175B Model on a 10,000-GPU Cluster"}
@@ -470,12 +520,14 @@ $$ T_\text{recovery} = T_\text{detect} + T_\text{reschedule} + T_\text{reload} +
```{python}
#| label: recovery-anatomy-table
#| echo: false
# Goal: Format recovery phase durations for @tbl-recovery-anatomy.
# Exports: t_detect_str, t_reschedule_str, t_reload_str, t_replay_str, t_total_str
t_detect_str = f"{R.t_detect}"
t_reschedule_str = f"{R.t_reschedule}"
t_reload_str = fmt(R.t_reload_s, precision=1)
t_replay_str = fmt(R.t_replay_s / SECONDS_PER_MINUTE, precision=1)
t_total_str = fmt(R.t_recovery_total_s / SECONDS_PER_MINUTE, precision=1)
t_replay_str = fmt(R.t_replay_s.m_as(ureg.minute), precision=1)
t_total_str = fmt(R.t_recovery_total_s, precision=1)
```
+----------------------------+---------------------------+-------------------------------------------------+
@@ -567,6 +619,8 @@ where $A$ is the availability of a single replica and $k$ is the number of repli
```{python}
#| label: availability-stacking-table
#| echo: false
# Goal: Format availability, nines count, and annual downtime for @tbl-availability-stacking.
# Exports: avail_data list of dicts with "k", "avail", "nines", "downtime" keys
avail_data = []
for k in R.avail_replicas:

View File

@@ -27,7 +27,8 @@ from mlsys.constants import (
CLOUD_EGRESS_PER_GB, USD,
STORAGE_COST_S3_STD, STORAGE_COST_GLACIER,
STORAGE_COST_NVME_LOW, STORAGE_COST_NVME_HIGH,
Mparam, Bparam, TFLOPs, GFLOPs
Mparam, Bparam, TFLOPs, GFLOPs,
watt
)
from mlsys.formatting import fmt, sci, check
@@ -77,13 +78,25 @@ Accelerators can compute faster than storage can feed them. A modern GPU process
# ┌─────────────────────────────────────────────────────────────────────────────
# │ STORAGE HIERARCHY AND MODEL SPECIFICATIONS
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: Used across the chapter for hierarchy tables and bottleneck analysis.
# │ Context: @sec-data-storage storage hierarchy tables and I/O bottleneck
# │ analysis paragraphs throughout the chapter.
# │
# │ Goal: Provide quantitative specs for hardware and lighthouse models.
# │ Show: The massive gap between HBM bandwidth and disk I/O.
# │ Goal: Establish the six-tier storage hierarchy gap by computing H100 HBM
# │ bandwidth (H100_MEM_BW) vs NVMe sequential bandwidth (NVME_SEQUENTIAL_BW),
# │ and estimate GPT-3 checkpoint write time (GPT3_PARAMS, FP16, at NVMe
# │ vs network storage) to show the I/O bottleneck in fault tolerance.
# │ Show: "3.35" TB/s H100 HBM vs "~7" GB/s NVMe — inline in the storage
# │ hierarchy tier comparison and checkpoint I/O bottleneck paragraphs.
# │ How: Direct .m_as() for each unit conversion; H100_TDP .m_as(watt).
# │
# │ Imports: mlsys.constants
# │ Exports: a100_mem, h100_bw_tbs, gpt3_params_b, resnet_params_m, etc.
# │ Imports: mlsys.constants (A100_MEM_CAPACITY, H100_MEM_CAPACITY, H100_MEM_BW,
# │ H100_FLOPS_FP8_TENSOR, H100_FLOPS_FP16_TENSOR, H100_TDP,
# │ GPT3_PARAMS, RESNET50_PARAMS, NVME_SEQUENTIAL_BW,
# │ NVLINK_H100_BW, PCIE_GEN5_BW, GiB, TB, TFLOPs, GB, second,
# │ watt, Bparam, Mparam)
# │ Exports: a100_mem, h100_mem, h100_bw_tbs, h100_fp8_tflops, h100_fp16_tflops,
# │ h100_tdp_w, gpt3_params_b, resnet_params_m, nvme_bw,
# │ nvlink_bw_gbs, pcie5_bw_gbs
# └─────────────────────────────────────────────────────────────────────────────
import math
@@ -93,21 +106,21 @@ class StorageSetup:
Namespace for global storage constants and specs.
"""
# GPU specs
a100_mem = A100_MEM_CAPACITY.to(GiB).magnitude
h100_mem = H100_MEM_CAPACITY.to(GiB).magnitude
h100_bw = H100_MEM_BW.to(TB/second).magnitude
h100_fp8 = H100_FLOPS_FP8_TENSOR.to(TFLOPs/second).magnitude
h100_fp16 = H100_FLOPS_FP16_TENSOR.to(TFLOPs/second).magnitude
h100_tdp = H100_TDP.magnitude
a100_mem = A100_MEM_CAPACITY.m_as(GiB)
h100_mem = H100_MEM_CAPACITY.m_as(GiB)
h100_bw = H100_MEM_BW.m_as(TB/second)
h100_fp8 = H100_FLOPS_FP8_TENSOR.m_as(TFLOPs/second)
h100_fp16 = H100_FLOPS_FP16_TENSOR.m_as(TFLOPs/second)
h100_tdp = H100_TDP.m_as(watt)
# Model specs
gpt3_params = GPT3_PARAMS.to(Bparam).magnitude
resnet_params = RESNET50_PARAMS.to(Mparam).magnitude
gpt3_params = GPT3_PARAMS.m_as(Bparam)
resnet_params = RESNET50_PARAMS.m_as(Mparam)
# Storage & Interconnect
nvme_bw = NVME_SEQUENTIAL_BW.to(GB/second).magnitude
nvlink_bw = NVLINK_H100_BW.to(GB/second).magnitude
pcie5_bw = PCIE_GEN5_BW.to(GB/second).magnitude
nvme_bw = NVME_SEQUENTIAL_BW.m_as(GB/second)
nvlink_bw = NVLINK_H100_BW.m_as(GB/second)
pcie5_bw = PCIE_GEN5_BW.m_as(GB/second)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
a100_mem = f"{StorageSetup.a100_mem:.0f}"
@@ -125,11 +138,11 @@ nvlink_bw_gbs = f"{StorageSetup.nvlink_bw:.0f}"
pcie5_bw_gbs = f"{StorageSetup.pcie5_bw:.0f}"
# Storage
nvme_bw = f"{NVME_SEQUENTIAL_BW.to(GB/second).magnitude:.1f}"
nvme_bw = f"{NVME_SEQUENTIAL_BW.m_as(GB/second):.1f}"
# Interconnect
nvlink_bw_gbs = f"{NVLINK_H100_BW.to(GB/second).magnitude:.0f}"
pcie5_bw_gbs = f"{PCIE_GEN5_BW.to(GB/second).magnitude:.0f}"
nvlink_bw_gbs = f"{NVLINK_H100_BW.m_as(GB/second):.0f}"
pcie5_bw_gbs = f"{PCIE_GEN5_BW.m_as(GB/second):.0f}"
# ┌── P.I.C.O. ISOLATED SCENARIO ───────────────────────────────────────────────
class StorageEconomics:

View File

@@ -40,25 +40,66 @@ A single GPU fails perhaps once per year. A thousand GPUs experience failures da
:::
```{python}
#| label: fault-tolerance-setup
#| echo: false
#| label: fault-tolerance-setup
# ┌─────────────────────────────────────────────────────────────────────────────
# │ FAULT TOLERANCE CHAPTER SETUP
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: Chapter-wide registry — values used in §Young-Daly Law
# │ (@eq-young-daly-applied, line ~1957), §Sharded Checkpointing (line ~2289),
# │ and §Recovery Cost (line ~2365).
# │
# │ Goal: Pre-compute GPT-3 checkpoint size (weights + Adam states) and
# │ per-worker shard size for 1000-worker training, motivating the
# │ checkpoint-interval formula and distributed checkpoint design.
# │ Show: gpt3_ckpt_tb="2.1" TB (full checkpoint),
# │ gpt3_shard_gb="2.1" GB (per-worker shard at 1000 workers) — inline in prose.
# │ How: Multiply GPT3_PARAMS.m_as(param) by bytes-per-param for each state;
# │ convert result pint Quantity with .m_as(TB) and .m_as(GB).
# │
# │ Imports: mlsys.constants (GPT3_PARAMS, param, byte, TB, GB, BILLION),
# │ mlsys.formatting (fmt, sci)
# │ Exports: gpt3_params_b, gpt3_ckpt_tb, gpt3_adam_tb, gpt3_shard_gb
# │ Note: PERSISTENT — gpt3_ckpt_tb used in §Young-Daly (line ~1957),
# │ §Sharded Checkpointing (line ~2289), §Recovery (line ~2365, ~2385);
# │ gpt3_shard_gb used in §Sharded Checkpointing (line ~2289), §Recovery (~2371, ~2385).
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import *
from mlsys.formatting import fmt, sci
# GPT-3 model parameters
gpt3_params_b = f"{GPT3_PARAMS.to(param).magnitude / BILLION:.0f}"
# ┌── P.I.C.O. ISOLATED SCENARIO ───────────────────────────────────────────────
class FaultToleranceSetup:
"""Namespace for GPT-3 checkpoint sizing and shard calculations."""
# GPT-3 checkpoint size: weights (2 bytes FP16) + Adam m (4 bytes) + Adam v (4 bytes) = 12 bytes/param
gpt3_ckpt_bytes = GPT3_PARAMS.magnitude * 12 * byte
gpt3_ckpt_tb = f"{gpt3_ckpt_bytes.to(TB).magnitude:.1f}"
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
# GPT-3 checkpoint byte layout:
# weights (2 bytes FP16) + Adam m (4 bytes) + Adam v (4 bytes) = 12 bytes/param
bytes_full_ckpt = 12 # bytes per param: weights + Adam m + v
bytes_adam_only = 8 # bytes per param: Adam m + v only
n_workers = 1000 # workers for shard size calculation
# GPT-3 Adam optimizer state: m + v = 8 bytes/param
gpt3_adam_bytes = GPT3_PARAMS.magnitude * 8 * byte
gpt3_adam_tb = f"{gpt3_adam_bytes.to(TB).magnitude:.1f}"
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
# Full checkpoint: weights + optimizer states
gpt3_ckpt_bytes = GPT3_PARAMS.m_as(param) * bytes_full_ckpt * byte
# Per-worker shard for 1000 workers
gpt3_shard_gb = f"{gpt3_ckpt_bytes.to(GB).magnitude / 1000:.1f}"
# Optimizer-only checkpoint: Adam m + v (no weights)
gpt3_adam_bytes = GPT3_PARAMS.m_as(param) * bytes_adam_only * byte
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
# No check() calls needed — values are monotone functions of constants.
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
gpt3_params_b = f"{GPT3_PARAMS.m_as(param) / BILLION:.0f}"
gpt3_ckpt_tb = f"{gpt3_ckpt_bytes.m_as(TB):.1f}"
gpt3_adam_tb = f"{gpt3_adam_bytes.m_as(TB):.1f}"
gpt3_shard_gb = f"{gpt3_ckpt_bytes.m_as(GB) / n_workers:.1f}"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
gpt3_params_b = FaultToleranceSetup.gpt3_params_b
gpt3_ckpt_tb = FaultToleranceSetup.gpt3_ckpt_tb
gpt3_adam_tb = FaultToleranceSetup.gpt3_adam_tb
gpt3_shard_gb = FaultToleranceSetup.gpt3_shard_gb
```
## Failure Analysis at Scale {#sec-fault-tolerance-reliability-reliability-failure-analysis-scale-6b4b}
@@ -2123,45 +2164,88 @@ Imagine 10,000 GPUs, each holding a 10 GB shard of the model state, simultaneous
While @tbl-checkpoint-overhead-by-model suggests modest overhead percentages, real deployments often encounter checkpoint times far exceeding these theoretical estimates. Diagnosing such discrepancies requires examining the full system stack.
```{python}
#| label: checkpoint-debug-calc
#| echo: false
#| label: checkpoint-debug-calc
# ┌─────────────────────────────────────────────────────────────────────────────
# │ CHECKPOINT DEBUG CALCULATION
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: "Debugging Checkpoint Overhead" callout in §Checkpoint Overhead.
# │
# │ Goal: Diagnose why a 70B model checkpoint takes 10 minutes instead of
# │ 2 minutes on an NFS-backed cluster, by computing theoretical bandwidth
# │ limits and contention-induced effective throughput per node.
# │ Show: total_ckpt_gb_str="420" GB, nfs_gbs_str="1.25" GB/s,
# │ min_write_min_str="5.6" min, per_node_mbs_str="20" MB/s,
# │ serialized_min_str="5,600" min — inline in the Fleet Stack diagnosis.
# │ How: Compute weights + optimizer state size in GB; derive NFS bandwidth in
# │ GB/s (10 Gbps / 8); calculate min write time and per-node bandwidth
# │ under contention from 64 concurrent nodes.
# │
# │ Imports: (none — pure Python arithmetic, no pint quantities)
# │ Exports: weights_gb_str, optimizer_gb_str, total_ckpt_gb_str, nfs_gbs_str,
# │ min_write_s_str, min_write_min_str, per_node_mbs_str, serialized_min_str,
# │ extended_weeks_str, extra_cost_k_str
# └─────────────────────────────────────────────────────────────────────────────
# 70B model checkpoint sizing
model_params_b = 70 # billions
bytes_per_param = 2 # BF16
weights_gb = model_params_b * bytes_per_param # 140 GB
optimizer_gb = weights_gb * 2 # Adam first + second moments
total_ckpt_gb = weights_gb + optimizer_gb # 420 GB
class CheckpointDebugCalc:
"""Diagnose 70B checkpoint overhead on NFS-backed cluster."""
# Storage constraints
nfs_gbps = 10 # Gbps network
nfs_gbs = nfs_gbps / 8 # 1.25 GB/s
min_write_s = total_ckpt_gb / nfs_gbs # seconds
min_write_min = min_write_s / 60 # minutes
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
model_params_b = 70 # 70B parameter model
bytes_per_param = 2 # BF16 weights
nfs_gbps = 10 # NFS network attachment bandwidth in Gbps
n_nodes = 64 # nodes writing simultaneously
overhead_pct = 30 # observed training throughput loss %
base_weeks = 2 # baseline training duration (weeks)
extra_cost_k = 500 # additional cost from extended training ($K)
# Contention analysis
n_nodes = 64
per_node_gbs = nfs_gbs / n_nodes # GB/s per node
per_node_mbs = per_node_gbs * 1000 # MB/s per node
serialized_min = (total_ckpt_gb / per_node_gbs) / 60
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
# Model state sizing
weights_gb = model_params_b * bytes_per_param # 140 GB
optimizer_gb = weights_gb * 2 # Adam m + v moments
total_ckpt_gb = weights_gb + optimizer_gb # 420 GB
# Training extension
overhead_pct = 30
base_weeks = 2
extended_weeks = base_weeks * (1 + overhead_pct / 100)
extra_cost_k = 500 # $K
# Storage bandwidth limits
nfs_gbs = nfs_gbps / 8 # 1.25 GB/s
min_write_s = total_ckpt_gb / nfs_gbs # theoretical minimum seconds
min_write_min = min_write_s / 60 # convert to minutes
# Format strings
weights_gb_str = f"{weights_gb:.0f}"
optimizer_gb_str = f"{optimizer_gb:.0f}"
total_ckpt_gb_str = f"{total_ckpt_gb:.0f}"
nfs_gbs_str = f"{nfs_gbs}"
min_write_s_str = f"{min_write_s:.0f}"
min_write_min_str = f"{min_write_min:.1f}"
per_node_mbs_str = f"{per_node_mbs:.0f}"
serialized_min_str = f"{serialized_min:.0f}"
extended_weeks_str = f"{extended_weeks:.1f}"
extra_cost_k_str = f"{extra_cost_k}"
# Contention: 64 nodes sharing the NFS bandwidth
per_node_gbs = nfs_gbs / n_nodes # GB/s per node under contention
per_node_mbs = per_node_gbs * 1000 # MB/s per node
serialized_min = (total_ckpt_gb / per_node_gbs) / 60 # worst-case serialized write time
# Training schedule impact
extended_weeks = base_weeks * (1 + overhead_pct / 100)
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
assert min_write_min < 10, "Theoretical minimum must be less than observed 10 minutes"
assert serialized_min > min_write_min, "Contention time must exceed theoretical minimum"
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
weights_gb_str = f"{weights_gb:.0f}"
optimizer_gb_str = f"{optimizer_gb:.0f}"
total_ckpt_gb_str = f"{total_ckpt_gb:.0f}"
nfs_gbs_str = f"{nfs_gbs}"
min_write_s_str = f"{min_write_s:.0f}"
min_write_min_str = f"{min_write_min:.1f}"
per_node_mbs_str = f"{per_node_mbs:.0f}"
serialized_min_str = f"{serialized_min:.0f}"
extended_weeks_str = f"{extended_weeks:.1f}"
extra_cost_k_str = f"{extra_cost_k}"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
weights_gb_str = CheckpointDebugCalc.weights_gb_str
optimizer_gb_str = CheckpointDebugCalc.optimizer_gb_str
total_ckpt_gb_str = CheckpointDebugCalc.total_ckpt_gb_str
nfs_gbs_str = CheckpointDebugCalc.nfs_gbs_str
min_write_s_str = CheckpointDebugCalc.min_write_s_str
min_write_min_str = CheckpointDebugCalc.min_write_min_str
per_node_gbs = CheckpointDebugCalc.per_node_gbs
per_node_mbs_str = CheckpointDebugCalc.per_node_mbs_str
serialized_min_str = CheckpointDebugCalc.serialized_min_str
extended_weeks_str = CheckpointDebugCalc.extended_weeks_str
extra_cost_k_str = CheckpointDebugCalc.extra_cost_k_str
```
::: {.callout-example title="Debugging Checkpoint Overhead"}