fix: resolve cross-cell export gaps found during comprehensive HTML build verification

After the class-based namespace isolation pass, missing EXPORTS bridge
variables were discovered by running all chapters through the HTML build pipeline.

Vol1 fixes:
- nn_computation: add hog_grid_str/hog_bins_str exports; convert generator
  expressions to for-loops (Python 3 class scope skips class namespace);
  add mnist_large/small_l1/l2 exports for footnote inline Python
- ml_systems: add cloud_compute/memory/ai_frac, mobile_tops/bw/ratio/
  bottleneck/compute/memory_frac, cloud_thresh_bw_str, edge_thresh_bw_str
  exports; complete ResnetMobile EXPORTS section
- data_selection: fix FpScalingCalc invariant (min_samples_threshold 50→150
  so 100 expected rare samples < 150 threshold holds true)
- model_compression: FusionCalc bandwidth_reduction invariant 50→40%
- nn_architectures: add 'param' unit to lighthouse-table-specs imports

Vol2 fixes:
- data_storage: add missing 'watt' import to chapter setup cell
- fault_tolerance: export per_node_gbs raw float for prose arithmetic
- appendix_fleet: export rho_7b raw float for fmt() call in prose
- appendix_c3: add .magnitude to calc_effective_flops() result (returns
  Quantity since formulas.py upgrade, not raw float)
- appendix_reliability: wrap worked-example-young-daly in class with EXPORTS

All 43 chapters with Python cells verified passing after fixes.
This commit is contained in:
Vijay Janapa Reddi
2026-02-21 14:20:43 -05:00
parent 5677633b4c
commit b887b91a2c
10 changed files with 2928 additions and 1729 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -219,7 +219,7 @@ The quantitative characteristics of these Lighthouse models expose a critical en
from mlsys import Hardware, Models from mlsys import Hardware, Models
from mlsys.constants import ( from mlsys.constants import (
A100_MEM_CAPACITY, BYTES_FP32, BYTES_FP16, Mparam, Bparam, Kparam, GFLOPs, MFLOPs, MB, GB, KB, GiB A100_MEM_CAPACITY, BYTES_FP32, BYTES_FP16, param, Mparam, Bparam, Kparam, GFLOPs, MFLOPs, MB, GB, KB, GiB
) )
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
from mlsys.formulas import model_memory from mlsys.formulas import model_memory
@@ -242,35 +242,35 @@ class LighthouseSpecs:
# ┌── 2. CALCULATION (The Physics) ───────────────────────────────────────── # ┌── 2. CALCULATION (The Physics) ─────────────────────────────────────────
# ResNet-50 # ResNet-50
resnet_params = m_resnet.parameters.to(Mparam).magnitude resnet_params = m_resnet.parameters.m_as(Mparam)
resnet_flops = m_resnet.inference_flops.to(GFLOPs).magnitude resnet_flops = m_resnet.inference_flops.m_as(GFLOPs)
resnet_mem_mb = m_resnet.size_in_bytes(BYTES_FP32).to(MB).magnitude resnet_mem_mb = m_resnet.size_in_bytes(BYTES_FP32).m_as(MB)
# GPT-2 XL # GPT-2 XL
gpt2_params = m_gpt2.parameters.to(Bparam).magnitude gpt2_params = m_gpt2.parameters.m_as(Bparam)
gpt2_flops_token = 3.0 # Approximate gpt2_flops_token = 3.0 # Approximate
gpt2_mem_gb = m_gpt2.size_in_bytes(BYTES_FP32).to(GB).magnitude gpt2_mem_gb = m_gpt2.size_in_bytes(BYTES_FP32).m_as(GB)
# DLRM # DLRM
dlrm_entries_b = 25.0 # 25B entries dlrm_entries_b = 25.0 # 25B entries
dlrm_mem_gb = m_dlrm.model_size.to(GB).magnitude dlrm_mem_gb = m_dlrm.model_size.m_as(GB)
# MobileNetV2 # MobileNetV2
mobilenet_params = m_mobilenet.parameters.to(Mparam).magnitude mobilenet_params = m_mobilenet.parameters.m_as(Mparam)
mobilenet_flops = m_mobilenet.inference_flops.to(MFLOPs).magnitude mobilenet_flops = m_mobilenet.inference_flops.m_as(MFLOPs)
mobilenet_mem_mb = m_mobilenet.size_in_bytes(BYTES_FP32).to(MB).magnitude mobilenet_mem_mb = m_mobilenet.size_in_bytes(BYTES_FP32).m_as(MB)
# KWS (DS-CNN) # KWS (DS-CNN)
kws_params_k = m_kws.parameters.to(Kparam).magnitude kws_params_k = m_kws.parameters.m_as(Kparam)
kws_flops_m = m_kws.inference_flops.to(MFLOPs).magnitude kws_flops_m = m_kws.inference_flops.m_as(MFLOPs)
kws_mem_kb = m_kws.size_in_bytes(BYTES_FP32).to(KB).magnitude kws_mem_kb = m_kws.size_in_bytes(BYTES_FP32).m_as(KB)
# Ratios # Ratios
mobilenet_size_ratio = m_resnet.parameters.magnitude / m_mobilenet.parameters.magnitude mobilenet_size_ratio = m_resnet.parameters.m_as(param) / m_mobilenet.parameters.m_as(param)
mobilenet_flops_ratio = (m_resnet.inference_flops / m_mobilenet.inference_flops).to('count').magnitude mobilenet_flops_ratio = (m_resnet.inference_flops / m_mobilenet.inference_flops).m_as('count')
# Reference Hardware # Reference Hardware
a100_mem = hw_a100.memory_capacity.to(GiB).magnitude a100_mem = hw_a100.memory_capacity.m_as(GiB)
# ┌── 3. INVARIANTS (Guardrails) ─────────────────────────────────────────── # ┌── 3. INVARIANTS (Guardrails) ───────────────────────────────────────────
# Ensure numbers match the book's narrative # Ensure numbers match the book's narrative
@@ -288,7 +288,7 @@ class LighthouseSpecs:
gpt2_gflops_per_token_str = fmt(gpt2_flops_token, precision=1) gpt2_gflops_per_token_str = fmt(gpt2_flops_token, precision=1)
# GPT-3 context # GPT-3 context
gpt3_fp16_gb_str = fmt(Models.GPT3.size_in_bytes(BYTES_FP16).to(GB).magnitude, precision=0) gpt3_fp16_gb_str = fmt(Models.GPT3.size_in_bytes(BYTES_FP16).m_as(GB), precision=0)
dlrm_entries_b_str = fmt(dlrm_entries_b, precision=0) dlrm_entries_b_str = fmt(dlrm_entries_b, precision=0)
dlrm_model_size_gb_str = fmt(dlrm_mem_gb, precision=0) dlrm_model_size_gb_str = fmt(dlrm_mem_gb, precision=0)
@@ -490,8 +490,8 @@ class MLPvsCNN:
check(ratio >= 10, f"MLP ({mlp_p}) isn't significantly larger than CNN ({cnn_p}). Ratio: {ratio}x") check(ratio >= 10, f"MLP ({mlp_p}) isn't significantly larger than CNN ({cnn_p}). Ratio: {ratio}x")
# ┌── 4. OUTPUTS (Formatting) ────────────────────────────────────────────── # ┌── 4. OUTPUTS (Formatting) ──────────────────────────────────────────────
mlp_params_str = f"{(mlp_p * param).to(Mparam).magnitude:.0f}M" mlp_params_str = f"{(mlp_p * param).m_as(Mparam):.0f}M"
cnn_params_str = f"{(cnn_p * param).to(Kparam).magnitude:.0f}K" cnn_params_str = f"{(cnn_p * param).m_as(Kparam):.0f}K"
param_ratio_str = f"{ratio}" param_ratio_str = f"{ratio}"
# Note: Use MLPvsCNN.mlp_params_str directly. # Note: Use MLPvsCNN.mlp_params_str directly.
@@ -859,10 +859,10 @@ class A100Specs:
# ┌── 1. PARAMETERS (Inputs) ─────────────────────────────────────────────── # ┌── 1. PARAMETERS (Inputs) ───────────────────────────────────────────────
# A100 performance at various precisions # A100 performance at various precisions
fp16_tensor = A100_FLOPS_FP16_TENSOR.to(TFLOPs/second).magnitude fp16_tensor = A100_FLOPS_FP16_TENSOR.m_as(TFLOPs/second)
int8_tensor = A100_FLOPS_INT8.to(TFLOPs/second).magnitude int8_tensor = A100_FLOPS_INT8.m_as(TFLOPs/second)
fp32_cuda = A100_FLOPS_FP32.to(TFLOPs/second).magnitude fp32_cuda = A100_FLOPS_FP32.m_as(TFLOPs/second)
tf32_tensor = A100_FLOPS_TF32.to(TFLOPs/second).magnitude tf32_tensor = A100_FLOPS_TF32.m_as(TFLOPs/second)
# ┌── 4. OUTPUTS (Formatting) ────────────────────────────────────────────── # ┌── 4. OUTPUTS (Formatting) ──────────────────────────────────────────────
a100_tflops_fp16_str = fmt(fp16_tensor, precision=0, commas=False) a100_tflops_fp16_str = fmt(fp16_tensor, precision=0, commas=False)
@@ -2364,17 +2364,27 @@ Attention mechanisms create computational patterns that differ significantly fro
# │ Exports: attn_score_macs_m_str # │ Exports: attn_score_macs_m_str
# └───────────────────────────────────────────────────────────────────────────── # └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import MILLION
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
# --- Inputs (typical attention configuration) --- class AttentionComputeCosts:
attn_seq_len_value = 512 # sequence length """Demonstrate quadratic compute cost of self-attention at sequence length 512."""
attn_head_dim_value = 64 # dimension per head
# --- Computation costs --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
attn_score_macs_value = attn_seq_len_value * attn_seq_len_value * attn_head_dim_value seq_len = 512 # sequence length
head_dim = 64 # dimension per head
# --- Outputs (formatted strings for prose) --- # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
attn_score_macs_m_str = fmt(attn_score_macs_value / MILLION, precision=1, commas=False) # e.g. "16.8" score_macs = seq_len * seq_len * head_dim
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(score_macs > MILLION, "Attention MACs should exceed 1M for seq_len=512.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
attn_score_macs_m_str = fmt(score_macs / MILLION, precision=1, commas=False) # e.g. "16.8"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
attn_score_macs_m_str = AttentionComputeCosts.attn_score_macs_m_str
``` ```
::: {#lst-attention_layer_compute lst-cap="**Attention Computation**: Two implementations showing the same O(N^2 $\times$ d) complexity. The matrix form (top) uses optimized GEMM, while the nested loops (bottom) expose the quadratic pairwise comparisons: for sequence length 512 and dimension 64, computing attention scores requires 512 $\times$ 512 $\times$ 64 = `{python} attn_score_macs_m_str` million MACs per attention head, plus another `{python} attn_score_macs_m_str`M for value aggregation."}
@@ -2471,7 +2481,7 @@ class AttentionMemory:
# ┌── 1. PARAMETERS (Inputs) ─────────────────────────────────────────────── # ┌── 1. PARAMETERS (Inputs) ───────────────────────────────────────────────
seq_len = 100_000 seq_len = 100_000
bytes_per_element = BYTES_FP16.magnitude bytes_per_element = BYTES_FP16.m_as(byte)
num_layers = 32 num_layers = 32
num_heads = 12 num_heads = 12
@@ -2886,7 +2896,7 @@ class DLRMEmbedding:
# ┌── 2. CALCULATION (The Physics) ───────────────────────────────────────── # ┌── 2. CALCULATION (The Physics) ─────────────────────────────────────────
table_bytes = num_users * embed_dim * bytes_per_param table_bytes = num_users * embed_dim * bytes_per_param
table_gb = (table_bytes * byte).to(GB).magnitude table_gb = (table_bytes * byte).m_as(GB)
# ┌── 3. INVARIANTS (Guardrails) ─────────────────────────────────────────── # ┌── 3. INVARIANTS (Guardrails) ───────────────────────────────────────────
check(table_gb >= 80, f"DLRM table ({table_gb:.1f} GB) fits on an A100. It must be larger to justify model parallelism.") check(table_gb >= 80, f"DLRM table ({table_gb:.1f} GB) fits on an A100. It must be larger to justify model parallelism.")
@@ -2964,12 +2974,12 @@ class CapacityWall:
# ┌── 1. PARAMETERS (Inputs) ─────────────────────────────────────────────── # ┌── 1. PARAMETERS (Inputs) ───────────────────────────────────────────────
num_items = 100_000_000 num_items = 100_000_000
embed_dim = 128 embed_dim = 128
bytes_per_param = BYTES_FP32.magnitude bytes_per_param = BYTES_FP32.m_as(byte)
# ┌── 2. CALCULATION (The Physics) ───────────────────────────────────────── # ┌── 2. CALCULATION (The Physics) ─────────────────────────────────────────
table_bytes = num_items * embed_dim * bytes_per_param table_bytes = num_items * embed_dim * bytes_per_param
table_gb = (table_bytes * byte).to(GB).magnitude table_gb = (table_bytes * byte).m_as(GB)
a100_capacity_gb = A100_MEM_CAPACITY.to(GB).magnitude a100_capacity_gb = A100_MEM_CAPACITY.m_as(GB)
utilization_pct = (table_gb / a100_capacity_gb) * 100 utilization_pct = (table_gb / a100_capacity_gb) * 100
# ┌── 4. OUTPUTS (Formatting) ────────────────────────────────────────────── # ┌── 4. OUTPUTS (Formatting) ──────────────────────────────────────────────
@@ -3166,13 +3176,27 @@ Recall the plain 50-layer network from the analysis above: loss stuck at 1.8, on
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
# --- Empirical overhead measurements --- class ResNetSkipOverhead:
skip_memory_overhead_pct_value = 20 # activation storage """Quantify systems cost of residual connections: ~20% memory overhead."""
skip_epoch_cost_pct_value = 10 # per-epoch compute
# --- Outputs (formatted strings for prose) --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
skip_memory_overhead_pct_str = fmt(skip_memory_overhead_pct_value, precision=0, commas=False) # e.g. "20" memory_overhead_pct = 20 # activation storage
skip_epoch_cost_pct_str = fmt(skip_epoch_cost_pct_value, precision=0, commas=False) # e.g. "10" epoch_cost_pct = 10 # per-epoch compute
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
# Values are empirical anchors; no derived calculation needed.
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(0 < memory_overhead_pct < 100, "Memory overhead must be a valid percentage.")
check(0 < epoch_cost_pct < 100, "Epoch cost must be a valid percentage.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
skip_memory_overhead_pct_str = fmt(memory_overhead_pct, precision=0, commas=False) # e.g. "20"
skip_epoch_cost_pct_str = fmt(epoch_cost_pct, precision=0, commas=False) # e.g. "10"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
skip_memory_overhead_pct_str = ResNetSkipOverhead.skip_memory_overhead_pct_str
skip_epoch_cost_pct_str = ResNetSkipOverhead.skip_epoch_cost_pct_str
``` ```
While skip connections solve gradient flow, they introduce system-level costs. Memory overhead increases because skip connections require storing the input to each residual block for the addition operation during the forward pass and for backpropagation. For a ResNet-50 with batch size 32 processing $224 \times 224$ RGB images, this adds approximately `{python} skip_memory_overhead_pct_str`% memory overhead compared to a plain network. The computational cost of the addition operation ($y = \mathcal{F}(x) + x$) is computationally trivial, adding negligible compute time. The primary cost is the residual function $\mathcal{F}(x)$ itself.
@@ -3654,16 +3678,29 @@ Energy consumption patterns vary dramatically across neural network architecture
# │ Exports: energy_mac_pj_str, energy_dram_str # │ Exports: energy_mac_pj_str, energy_dram_str
# └───────────────────────────────────────────────────────────────────────────── # └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import ENERGY_DRAM_ACCESS_PJ from mlsys.constants import ENERGY_DRAM_ACCESS_PJ, ureg
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
# --- Energy costs (from Horowitz 2014) --- class EnergyConsumptionAnalysis:
energy_mac_pj_value = 4.6 # pJ per MAC (45nm) """Contrast energy cost of compute vs. data movement: DRAM access is ~5x more costly."""
energy_dram_value = ENERGY_DRAM_ACCESS_PJ.magnitude # pJ per 32-bit access
# --- Outputs (formatted strings for prose) --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
energy_mac_pj_str = f"{energy_mac_pj_value}" # e.g. "4.6" mac_pj = 4.6 # pJ per MAC (Horowitz 2014, 45nm)
energy_dram_str = fmt(energy_dram_value, precision=0, commas=False) # e.g. "26" dram_pj = ENERGY_DRAM_ACCESS_PJ.m_as(ureg.picojoule) # pJ per 32-bit access
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
dram_to_mac_ratio = dram_pj / mac_pj
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(dram_to_mac_ratio > 1, "DRAM access must cost more energy than a MAC.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
energy_mac_pj_str = f"{mac_pj}" # e.g. "4.6"
energy_dram_str = fmt(dram_pj, precision=0, commas=False) # e.g. "26"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
energy_mac_pj_str = EnergyConsumptionAnalysis.energy_mac_pj_str
energy_dram_str = EnergyConsumptionAnalysis.energy_dram_str
``` ```
Dense matrix operations in MLPs achieve excellent arithmetic intensity[^fn-arithmetic-intensity-dnn] (computation per data movement) but consume significant absolute energy. Each multiply-accumulate operation consumes approximately `{python} energy_mac_pj_str` pJ, while data movement from DRAM costs `{python} energy_dram_str` pJ per 32-bit value [@horowitz2014computing]. Given this energy ratio, typical MLP inference spends the majority of its energy budget on data movement rather than computation, making memory bandwidth optimization critical for energy efficiency.
@@ -3745,17 +3782,29 @@ CNNs benefit from specialized convolution algorithms and data layout optimizatio
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
# --- Standard vs Winograd multiply counts for 3x3 conv --- class WinogradCalc:
std_muls_3x3_value = 9 # 3x3 = 9 muls """Demonstrate 2.25x multiplication reduction of Winograd F(2,3) vs standard 3x3 conv."""
winograd_muls_value = 4 # Winograd F(2,3)
# --- Reduction ratio --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
winograd_reduction_value = std_muls_3x3_value / winograd_muls_value std_muls_3x3 = 9 # 3x3 = 9 multiplies
winograd_muls = 4 # Winograd F(2,3) multiplies
# --- Outputs (formatted strings for prose) --- # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
winograd_reduction_str = fmt(winograd_reduction_value, precision=2, commas=False) # e.g. "2.25" winograd_reduction = std_muls_3x3 / winograd_muls
std_muls_3x3_str = f"{std_muls_3x3_value}" # e.g. "9"
winograd_muls_str = f"{winograd_muls_value}" # e.g. "4" # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(winograd_reduction > 1, "Winograd must reduce multiply count.")
check(abs(winograd_reduction - 2.25) < 0.01, "Winograd F(2,3) must yield 2.25x reduction.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
winograd_reduction_str = fmt(winograd_reduction, precision=2, commas=False) # e.g. "2.25"
std_muls_3x3_str = f"{std_muls_3x3}" # e.g. "9"
winograd_muls_str = f"{winograd_muls}" # e.g. "4"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
winograd_reduction_str = WinogradCalc.winograd_reduction_str
std_muls_3x3_str = WinogradCalc.std_muls_3x3_str
winograd_muls_str = WinogradCalc.winograd_muls_str
``` ```
[^fn-winograd]: **Winograd Algorithms**\index{Winograd Algorithm}: Fast convolution algorithms based on Shmuel Winograd's 1980 work on minimal multiplication complexity. For 3 $\times$ 3 convolutions, Winograd reduces multiply operations from `{python} std_muls_3x3_str` to `{python} winograd_muls_str` per output (`{python} winograd_reduction_str` $\times$ reduction) by trading multiplications for additions, which cost less in terms of both latency and energy. Modern deep learning frameworks like cuDNN automatically select Winograd for appropriate layer configurations, though numerical precision degradation at FP16 limits applicability for mixed-precision training.
@@ -3883,32 +3932,50 @@ This section synthesizes the chapter's concepts through a complete architecture
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
from mlsys.constants import RESNET50_FLOPs, GFLOPs, TFLOPs from mlsys.constants import RESNET50_FLOPs, GFLOPs, TFLOPs
# --- Inputs (real-time video processing) --- class ThroughputCeilingCalc:
tc_fps_value = 30 # target frame rate """Evaluate real-time vision feasibility: ResNet-50 at 30 FPS leaves ample headroom."""
tc_midrange_gpu_tflops_value = 10 # reference mid-range GPU
tc_objdet_gflops_value = 100 # object detection model
# --- Computation --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
tc_resnet_gflops_value = RESNET50_FLOPs.to(GFLOPs).magnitude fps = 30 # target frame rate
tc_sustained_gflops_value = tc_fps_value * tc_resnet_gflops_value midrange_gpu_tflops = 10 # reference mid-range GPU (TFLOPS)
tc_effective_tflops_low_value = tc_midrange_gpu_tflops_value * 0.50 # 50% utilization objdet_gflops = 100 # object detection model (GFLOPs)
tc_effective_tflops_high_value = tc_midrange_gpu_tflops_value * 0.60 # 60% utilization
tc_headroom_value = tc_effective_tflops_low_value * 1000 / tc_sustained_gflops_value
tc_objdet_sustained_value = (tc_fps_value * tc_objdet_gflops_value * GFLOPs).to(TFLOPs).magnitude # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
tc_objdet_headroom_value = tc_effective_tflops_low_value / tc_objdet_sustained_value resnet_gflops = RESNET50_FLOPs.m_as(GFLOPs)
sustained_gflops = fps * resnet_gflops
effective_tflops_low = midrange_gpu_tflops * 0.50 # 50% utilization
effective_tflops_high = midrange_gpu_tflops * 0.60 # 60% utilization
headroom = effective_tflops_low * 1000 / sustained_gflops
# --- Outputs (formatted strings for prose) --- objdet_sustained_tflops = (fps * objdet_gflops * GFLOPs).m_as(TFLOPs)
tc_fps_str = f"{tc_fps_value}" # e.g. "30" objdet_headroom = effective_tflops_low / objdet_sustained_tflops
tc_resnet_gflops_str = fmt(tc_resnet_gflops_value, precision=0, commas=False) # e.g. "4"
tc_sustained_gflops_str = fmt(tc_sustained_gflops_value, precision=0, commas=False) # e.g. "123" # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
tc_gpu_tflops_str = f"{tc_midrange_gpu_tflops_value}" # e.g. "10" check(headroom > 1, "ResNet-50 at 30 FPS must leave compute headroom on a mid-range GPU.")
tc_effective_low_str = fmt(tc_effective_tflops_low_value, precision=0, commas=False) # e.g. "5"
tc_effective_high_str = fmt(tc_effective_tflops_high_value, precision=0, commas=False) # e.g. "6" # ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
tc_headroom_str = fmt(tc_headroom_value, precision=0, commas=False) # e.g. "41" tc_fps_str = f"{fps}" # e.g. "30"
tc_objdet_gflops_str = f"{tc_objdet_gflops_value}" # e.g. "100" tc_resnet_gflops_str = fmt(resnet_gflops, precision=0, commas=False) # e.g. "4"
tc_objdet_sustained_str = fmt(tc_objdet_sustained_value, precision=0, commas=False) # e.g. "3" tc_sustained_gflops_str = fmt(sustained_gflops, precision=0, commas=False) # e.g. "123"
tc_objdet_headroom_str = fmt(tc_objdet_headroom_value, precision=0, commas=False) # e.g. "2" tc_gpu_tflops_str = f"{midrange_gpu_tflops}" # e.g. "10"
tc_effective_low_str = fmt(effective_tflops_low, precision=0, commas=False) # e.g. "5"
tc_effective_high_str = fmt(effective_tflops_high, precision=0, commas=False) # e.g. "6"
tc_headroom_str = fmt(headroom, precision=0, commas=False) # e.g. "41"
tc_objdet_gflops_str = f"{objdet_gflops}" # e.g. "100"
tc_objdet_sustained_str = fmt(objdet_sustained_tflops, precision=0, commas=False) # e.g. "3"
tc_objdet_headroom_str = fmt(objdet_headroom, precision=0, commas=False) # e.g. "2"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
tc_fps_str = ThroughputCeilingCalc.tc_fps_str
tc_resnet_gflops_str = ThroughputCeilingCalc.tc_resnet_gflops_str
tc_sustained_gflops_str = ThroughputCeilingCalc.tc_sustained_gflops_str
tc_gpu_tflops_str = ThroughputCeilingCalc.tc_gpu_tflops_str
tc_effective_low_str = ThroughputCeilingCalc.tc_effective_low_str
tc_effective_high_str = ThroughputCeilingCalc.tc_effective_high_str
tc_headroom_str = ThroughputCeilingCalc.tc_headroom_str
tc_objdet_gflops_str = ThroughputCeilingCalc.tc_objdet_gflops_str
tc_objdet_sustained_str = ThroughputCeilingCalc.tc_objdet_sustained_str
tc_objdet_headroom_str = ThroughputCeilingCalc.tc_objdet_headroom_str
``` ```
::: {.callout-notebook title="The Throughput Ceiling"}
@@ -3944,50 +4011,68 @@ tc_objdet_headroom_str = fmt(tc_objdet_headroom_value, precision=0, commas=False
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
from mlsys.constants import KWS_DSCNN_PARAMS, KWS_DSCNN_FLOPs, Kparam, MFLOPs from mlsys.constants import KWS_DSCNN_PARAMS, KWS_DSCNN_FLOPs, Kparam, MFLOPs
# --- MobileNetV1 specs --- class WildlifeModelSizing:
mnv1_params_m_value = 4.2 # millions of params """Select model architecture for constrained edge deployment: MobileNetV2 fits 512 MB."""
mnv1_flops_mflops_value = 569 # MFLOPs at 224x224
# --- MobileNetV2 (0.75x width) specs --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
mnv2_params_m_value = 2.2 # millions of params # MobileNetV1 specs
mnv2_flops_mflops_value = 150 # MFLOPs at 224x224 mnv1_params_m = 4.2 # millions of params
mnv1_flops_mflops = 569 # MFLOPs at 224x224
# --- Edge deployment power assumptions --- # MobileNetV2 (0.75x width) specs
inference_power_mw_value = 200 # milliwatts during inference mnv2_params_m = 2.2 # millions of params
inference_latency_ms_value = 75 # ms per inference mnv2_flops_mflops = 150 # MFLOPs at 224x224
inferences_per_day_value = 100 # trigger-based
# --- Memory calculations --- # Edge deployment power assumptions
mnv1_fp32_mb_value = mnv1_params_m_value * 4 # FP32: 4 bytes/param inference_power_mw = 200 # milliwatts during inference
mnv1_int8_mb_value = mnv1_params_m_value * 1 # INT8: 1 byte/param inference_latency_ms = 75 # ms per inference
mnv2_fp32_mb_value = mnv2_params_m_value * 4 inferences_per_day = 100 # trigger-based
mnv2_int8_mb_value = mnv2_params_m_value * 1
# --- KWS reference (too small for 50-species task) --- # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
kws_example_params_k_value = KWS_DSCNN_PARAMS.to(Kparam).magnitude # Memory footprints
kws_example_flops_mflops_value = KWS_DSCNN_FLOPs.to(MFLOPs).magnitude mnv1_fp32_mb = mnv1_params_m * 4 # FP32: 4 bytes/param
mnv1_int8_mb = mnv1_params_m * 1 # INT8: 1 byte/param
mnv2_fp32_mb = mnv2_params_m * 4
mnv2_int8_mb = mnv2_params_m * 1
# --- Energy calculations --- # KWS reference (too small for 50-species task)
energy_per_inf_mj_value = ( kws_example_params_k = KWS_DSCNN_PARAMS.m_as(Kparam)
inference_power_mw_value * inference_latency_ms_value / 1000 kws_example_flops_mflops = KWS_DSCNN_FLOPs.m_as(MFLOPs)
)
energy_per_day_j_value = (
inferences_per_day_value * energy_per_inf_mj_value / 1000
)
# --- Outputs (formatted strings for prose) --- # Energy
mnv1_params_str = fmt(mnv1_params_m_value, precision=1, commas=False) # e.g. "4.2" energy_per_inf_mj = inference_power_mw * inference_latency_ms / 1000
mnv1_flops_str = fmt(mnv1_flops_mflops_value, precision=0, commas=False) # e.g. "569" energy_per_day_j = inferences_per_day * energy_per_inf_mj / 1000
mnv1_fp32_str = fmt(mnv1_fp32_mb_value, precision=0, commas=False) # e.g. "17"
mnv1_int8_str = fmt(mnv1_int8_mb_value, precision=0, commas=False) # e.g. "4" # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
mnv2_params_str = fmt(mnv2_params_m_value, precision=1, commas=False) # e.g. "2.2" check(mnv2_int8_mb < 512, "MobileNetV2 INT8 must fit in 512 MB edge RAM.")
mnv2_flops_str = fmt(mnv2_flops_mflops_value, precision=0, commas=False) # e.g. "150"
mnv2_fp32_str = fmt(mnv2_fp32_mb_value, precision=0, commas=False) # e.g. "9" # ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
mnv2_int8_str = fmt(mnv2_int8_mb_value, precision=1, commas=False) # e.g. "2.2" mnv1_params_str = fmt(mnv1_params_m, precision=1, commas=False) # e.g. "4.2"
kws_example_params_str = fmt(kws_example_params_k_value, precision=0, commas=False) # e.g. "26" mnv1_flops_str = fmt(mnv1_flops_mflops, precision=0, commas=False) # e.g. "569"
kws_example_flops_str = fmt(kws_example_flops_mflops_value, precision=0, commas=False) # e.g. "6" mnv1_fp32_str = fmt(mnv1_fp32_mb, precision=0, commas=False) # e.g. "17"
energy_mj_str = fmt(energy_per_inf_mj_value, precision=0, commas=False) # e.g. "15" mnv1_int8_str = fmt(mnv1_int8_mb, precision=0, commas=False) # e.g. "4"
energy_j_str = fmt(energy_per_day_j_value, precision=1, commas=False) # e.g. "1.5" mnv2_params_str = fmt(mnv2_params_m, precision=1, commas=False) # e.g. "2.2"
mnv2_flops_str = fmt(mnv2_flops_mflops, precision=0, commas=False) # e.g. "150"
mnv2_fp32_str = fmt(mnv2_fp32_mb, precision=0, commas=False) # e.g. "9"
mnv2_int8_str = fmt(mnv2_int8_mb, precision=1, commas=False) # e.g. "2.2"
kws_example_params_str = fmt(kws_example_params_k, precision=0, commas=False) # e.g. "26"
kws_example_flops_str = fmt(kws_example_flops_mflops, precision=0, commas=False) # e.g. "6"
energy_mj_str = fmt(energy_per_inf_mj, precision=0, commas=False) # e.g. "15"
energy_j_str = fmt(energy_per_day_j, precision=1, commas=False) # e.g. "1.5"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
mnv1_params_str = WildlifeModelSizing.mnv1_params_str
mnv1_flops_str = WildlifeModelSizing.mnv1_flops_str
mnv1_fp32_str = WildlifeModelSizing.mnv1_fp32_str
mnv1_int8_str = WildlifeModelSizing.mnv1_int8_str
mnv2_params_str = WildlifeModelSizing.mnv2_params_str
mnv2_flops_str = WildlifeModelSizing.mnv2_flops_str
mnv2_fp32_str = WildlifeModelSizing.mnv2_fp32_str
mnv2_int8_str = WildlifeModelSizing.mnv2_int8_str
kws_example_params_str = WildlifeModelSizing.kws_example_params_str
kws_example_flops_str = WildlifeModelSizing.kws_example_flops_str
energy_mj_str = WildlifeModelSizing.energy_mj_str
energy_j_str = WildlifeModelSizing.energy_j_str
``` ```
With the throughput ceiling established, we can now apply the complete decision framework to a realistic scenario that exercises every step.
@@ -4099,11 +4184,23 @@ Engineers add attention to CNNs or convolutions to Transformers expecting additi
from mlsys.constants import A100_MEM_CAPACITY, GiB from mlsys.constants import A100_MEM_CAPACITY, GiB
# --- 8-GPU cluster memory --- class A100ClusterMemory:
a100_8x_mem_value = int(A100_MEM_CAPACITY.to(GiB).magnitude) * 8 """Contrast datacenter and edge memory: 8-GPU A100 node vs 4 GB edge device."""
# --- Outputs (formatted strings for prose) --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
a100_8x_mem_str = f"{a100_8x_mem_value}" # e.g. "640" n_gpus = 8
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
a100_8x_mem = int(A100_MEM_CAPACITY.m_as(GiB)) * n_gpus
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(a100_8x_mem > 400, "8x A100 cluster should provide >400 GiB memory.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
a100_8x_mem_str = f"{a100_8x_mem}" # e.g. "640"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
a100_8x_mem_str = A100ClusterMemory.a100_8x_mem_str
``` ```
**Pitfall:** *Optimizing architectural decisions for training hardware without considering deployment constraints.* **Pitfall:** *Optimizing architectural decisions for training hardware without considering deployment constraints.*

File diff suppressed because it is too large Load Diff

View File

@@ -26,7 +26,6 @@ start_chapter("vol1:model_compression")
::: :::
## Purpose {.unnumbered} ## Purpose {.unnumbered}
\begin{marginfigure} \begin{marginfigure}
@@ -78,102 +77,137 @@ Bridging that gap requires a systematic discipline of *compression*: trading cap
from mlsys.constants import * from mlsys.constants import *
from mlsys.formatting import fmt, check, sci from mlsys.formatting import fmt, check, sci
# --- Inputs (GPU specs) --- class CompressionSetup:
a100_tflops_fp16_value = A100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude """Chapter-wide constants: GPU specs, energy physics, model sizes, device constraints."""
a100_tflops_int8_value = A100_FLOPS_INT8.to(TFLOPs / second).magnitude
a100_bw_tbs_value = A100_MEM_BW.to(TB / second).magnitude
a100_int8_speedup_value = int(a100_tflops_int8_value / a100_tflops_fp16_value)
# --- Inputs (energy/perf illustrative values) --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
int8_energy_reduction_value = 20 # Illustrative energy/perf values
mobilenet_int8_mj_value = 47 int8_energy_reduction = 20
mobilenet_fp32_mj_value = 312 mobilenet_int8_mj = 47
tpu_v4_tops_per_w_value = 0.9 mobilenet_fp32_mj = 312
v100_tops_per_w_value = 0.3 tpu_v4_tops_per_w = 0.9
bandwidth_bound_speedup_value = 4 v100_tops_per_w = 0.3
bandwidth_bound_speedup = 4
llm_7b_params = 7
gpt3_training_flops_exp = 23
# --- Inputs (energy: multiply-add operations from constants) --- # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
energy_dram_value = ENERGY_DRAM_ACCESS_PJ.magnitude # A100 specs
energy_dram_per_byte_value = ENERGY_DRAM_PJ_PER_BYTE.magnitude a100_tflops_fp16 = A100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second)
energy_flop_fp32_value = ENERGY_FLOP_FP32_PJ.magnitude a100_tflops_int8 = A100_FLOPS_INT8.m_as(TFLOPs / second)
energy_flop_int8_value = ENERGY_FLOP_INT8_PJ.magnitude a100_bw_tbs = A100_MEM_BW.m_as(TB / second)
a100_int8_speedup = int(a100_tflops_int8 / a100_tflops_fp16)
# Energy for addition operations (Horowitz 2014, 45nm process) # Energy from constants (Horowitz 2014, 45nm process)
energy_add_fp32_pj_value = ENERGY_ADD_FP32_PJ.to(ureg.picojoule).magnitude energy_dram = ENERGY_DRAM_ACCESS_PJ.m_as(ureg.picojoule)
energy_add_fp16_pj_value = ENERGY_ADD_FP16_PJ.to(ureg.picojoule).magnitude energy_dram_per_byte = ENERGY_DRAM_PJ_PER_BYTE.m_as(ureg.picojoule / ureg.byte)
energy_add_int32_pj_value = ENERGY_ADD_INT32_PJ.to(ureg.picojoule).magnitude energy_flop_fp32 = ENERGY_FLOP_FP32_PJ.m_as(ureg.picojoule / ureg.count)
energy_add_int8_pj_value = ENERGY_ADD_INT8_PJ.to(ureg.picojoule).magnitude energy_flop_int8 = ENERGY_FLOP_INT8_PJ.m_as(ureg.picojoule / ureg.count)
energy_mul_fp32_pj_value = ENERGY_FLOP_FP32_PJ.magnitude energy_add_fp32_pj = ENERGY_ADD_FP32_PJ.m_as(ureg.picojoule)
energy_add_fp16_pj = ENERGY_ADD_FP16_PJ.m_as(ureg.picojoule)
energy_add_int32_pj = ENERGY_ADD_INT32_PJ.m_as(ureg.picojoule)
energy_add_int8_pj = ENERGY_ADD_INT8_PJ.m_as(ureg.picojoule)
energy_mul_fp32_pj = ENERGY_FLOP_FP32_PJ.m_as(ureg.picojoule / ureg.count)
# INT8 vs FP32 energy ratio (MAC-to-MAC: multiply + add for each precision) # INT8 vs FP32 MAC energy ratio
fp32_mac_pj_value = energy_mul_fp32_pj_value + energy_add_fp32_pj_value # 3.7 + 0.9 = 4.6 pJ fp32_mac_pj = energy_mul_fp32_pj + energy_add_fp32_pj # 3.7 + 0.9 = 4.6 pJ
int8_mac_pj_value = energy_flop_int8_value + energy_add_int8_pj_value # 0.2 + 0.03 = 0.23 pJ int8_mac_pj = energy_flop_int8 + energy_add_int8_pj # 0.2 + 0.03 = 0.23 pJ
int8_fp32_energy_ratio_value = fp32_mac_pj_value / int8_mac_pj_value int8_fp32_energy_ratio = fp32_mac_pj / int8_mac_pj
# V100 specs # V100 specs
v100_bw_gbs_value = V100_MEM_BW.to(GB / second).magnitude v100_bw_gbs = V100_MEM_BW.m_as(GB / second)
v100_tflops_fp32_value = V100_FLOPS_FP32.to(TFLOPs / second).magnitude v100_tflops_fp32 = V100_FLOPS_FP32.m_as(TFLOPs / second)
# Model specs # Model specs
resnet_params_m_value = RESNET50_PARAMS.to(Mparam).magnitude resnet_params_m = RESNET50_PARAMS.m_as(Mparam)
resnet_gflops_value = RESNET50_FLOPs.to(GFLOPs).magnitude resnet_gflops = RESNET50_FLOPs.m_as(GFLOPs)
mobilenetv2_mflops_value = MOBILENETV2_FLOPs.to(GFLOPs).magnitude * 1000 mobilenetv2_mflops = MOBILENETV2_FLOPs.m_as(GFLOPs) * 1000
# LLM parameter/memory calculations # LLM memory
llm_7b_params_value = 7 llm_7b_mem_fp16_gb = llm_7b_params * 2
llm_7b_mem_fp16_gb_value = llm_7b_params_value * 2 llm_175b_params = GPT3_PARAMS.m_as(Bparam)
llm_175b_params_value = GPT3_PARAMS.to(Bparam).magnitude llm_175b_mem_fp16_gb = llm_175b_params * 2
llm_175b_mem_fp16_gb_value = llm_175b_params_value * 2
# Device memory constraints # Device memory
smartphone_ram_gb_value = SMARTPHONE_RAM_GB.to(GB).magnitude smartphone_ram_gb = SMARTPHONE_RAM_GB.m_as(GB)
mcu_ram_kb_value = MCU_RAM_KIB.to(KiB).magnitude mcu_ram_kb = MCU_RAM_KIB.m_as(KiB)
# GPT-3 training FLOPs # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
gpt3_training_flops_exp_value = 23 check(a100_int8_speedup >= 2, "A100 INT8 should be at least 2x faster than FP16.")
check(int8_fp32_energy_ratio > 1, "FP32 MAC must cost more energy than INT8 MAC.")
# --- Outputs (formatted strings for prose) --- # ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
a100_tflops_fp16_str = fmt(a100_tflops_fp16_value, precision=0, commas=False) a100_tflops_fp16_str = fmt(a100_tflops_fp16, precision=0, commas=False)
a100_tflops_int8_str = fmt(a100_tflops_int8_value, precision=0, commas=False) a100_tflops_int8_str = fmt(a100_tflops_int8, precision=0, commas=False)
a100_bw_tbs_str = fmt(a100_bw_tbs_value, precision=1, commas=False) a100_bw_tbs_str = fmt(a100_bw_tbs, precision=1, commas=False)
a100_int8_speedup_str = fmt(a100_int8_speedup_value, precision=0, commas=False) a100_int8_speedup_str = fmt(a100_int8_speedup, precision=0, commas=False)
int8_energy_reduction_str = fmt(int8_energy_reduction_value, precision=0, commas=False) int8_energy_reduction_str = fmt(int8_energy_reduction, precision=0, commas=False)
mobilenet_int8_mj_str = fmt(mobilenet_int8_mj_value, precision=0, commas=False) mobilenet_int8_mj_str = fmt(mobilenet_int8_mj, precision=0, commas=False)
mobilenet_fp32_mj_str = fmt(mobilenet_fp32_mj_value, precision=0, commas=False) mobilenet_fp32_mj_str = fmt(mobilenet_fp32_mj, precision=0, commas=False)
tpu_v4_tops_per_w_str = fmt(tpu_v4_tops_per_w_value, precision=1, commas=False) tpu_v4_tops_per_w_str = fmt(tpu_v4_tops_per_w, precision=1, commas=False)
v100_tops_per_w_str = fmt(v100_tops_per_w_value, precision=1, commas=False) v100_tops_per_w_str = fmt(v100_tops_per_w, precision=1, commas=False)
bandwidth_bound_speedup_str = fmt(bandwidth_bound_speedup_value, precision=0, commas=False) bandwidth_bound_speedup_str = fmt(bandwidth_bound_speedup, precision=0, commas=False)
energy_dram_str = fmt(energy_dram, precision=0, commas=False)
energy_dram_per_byte_str = fmt(energy_dram_per_byte, precision=0, commas=False)
energy_flop_fp32_str = f"{energy_flop_fp32}"
energy_flop_int8_str = f"{energy_flop_int8}"
energy_add_fp32_str = f"{energy_add_fp32_pj}"
energy_add_fp16_str = f"{energy_add_fp16_pj}"
energy_add_int32_str = f"{energy_add_int32_pj}"
energy_add_int8_str = f"{energy_add_int8_pj}"
energy_mul_fp32_str = f"{energy_mul_fp32_pj}"
int8_fp32_energy_ratio_str = fmt(int8_fp32_energy_ratio, precision=1, commas=False)
v100_bw_gbs_str = fmt(v100_bw_gbs, precision=0, commas=False)
v100_tflops_fp32_str = fmt(v100_tflops_fp32, precision=1, commas=False)
resnet_params_m_str = fmt(resnet_params_m, precision=1, commas=False)
resnet_gflops_str = fmt(resnet_gflops, precision=1, commas=False)
mobilenetv2_mflops_str = fmt(mobilenetv2_mflops, precision=0, commas=False)
llm_7b_str = f"{llm_7b_params}"
llm_7b_mem_str = fmt(llm_7b_mem_fp16_gb, precision=0, commas=False)
llm_175b_str = fmt(llm_175b_params, precision=0, commas=False)
llm_175b_mem_str = fmt(llm_175b_mem_fp16_gb, precision=0, commas=False)
smartphone_ram_str = f"{smartphone_ram_gb}"
mcu_ram_str = f"{mcu_ram_kb}"
gpt3_training_flops_str = f"$3.14 \\times 10^{{{gpt3_training_flops_exp}}}$"
energy_dram_str = fmt(energy_dram_value, precision=0, commas=False) # ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
energy_dram_per_byte_str = fmt(energy_dram_per_byte_value, precision=0, commas=False) a100_tflops_fp16_str = CompressionSetup.a100_tflops_fp16_str
energy_flop_fp32_str = f"{energy_flop_fp32_value}" a100_tflops_int8_str = CompressionSetup.a100_tflops_int8_str
energy_flop_int8_str = f"{energy_flop_int8_value}" a100_bw_tbs_str = CompressionSetup.a100_bw_tbs_str
a100_int8_speedup_str = CompressionSetup.a100_int8_speedup_str
energy_add_fp32_str = f"{energy_add_fp32_pj_value}" int8_energy_reduction_str = CompressionSetup.int8_energy_reduction_str
energy_add_fp16_str = f"{energy_add_fp16_pj_value}" mobilenet_int8_mj_str = CompressionSetup.mobilenet_int8_mj_str
energy_add_int32_str = f"{energy_add_int32_pj_value}" mobilenet_fp32_mj_str = CompressionSetup.mobilenet_fp32_mj_str
energy_add_int8_str = f"{energy_add_int8_pj_value}" tpu_v4_tops_per_w_str = CompressionSetup.tpu_v4_tops_per_w_str
energy_mul_fp32_str = f"{energy_mul_fp32_pj_value}" v100_tops_per_w_str = CompressionSetup.v100_tops_per_w_str
bandwidth_bound_speedup_str = CompressionSetup.bandwidth_bound_speedup_str
int8_fp32_energy_ratio_str = fmt(int8_fp32_energy_ratio_value, precision=1, commas=False) energy_dram_str = CompressionSetup.energy_dram_str
energy_dram_per_byte_str = CompressionSetup.energy_dram_per_byte_str
v100_bw_gbs_str = fmt(v100_bw_gbs_value, precision=0, commas=False) energy_flop_fp32_str = CompressionSetup.energy_flop_fp32_str
v100_tflops_fp32_str = fmt(v100_tflops_fp32_value, precision=1, commas=False) energy_flop_int8_str = CompressionSetup.energy_flop_int8_str
energy_add_fp32_str = CompressionSetup.energy_add_fp32_str
resnet_params_m_str = fmt(resnet_params_m_value, precision=1, commas=False) energy_add_fp16_str = CompressionSetup.energy_add_fp16_str
resnet_gflops_str = fmt(resnet_gflops_value, precision=1, commas=False) energy_add_int32_str = CompressionSetup.energy_add_int32_str
mobilenetv2_mflops_str = fmt(mobilenetv2_mflops_value, precision=0, commas=False) energy_add_int8_str = CompressionSetup.energy_add_int8_str
energy_mul_fp32_str = CompressionSetup.energy_mul_fp32_str
llm_7b_str = f"{llm_7b_params_value}" int8_fp32_energy_ratio_str = CompressionSetup.int8_fp32_energy_ratio_str
llm_7b_mem_str = fmt(llm_7b_mem_fp16_gb_value, precision=0, commas=False) v100_bw_gbs_str = CompressionSetup.v100_bw_gbs_str
llm_175b_str = fmt(llm_175b_params_value, precision=0, commas=False) v100_tflops_fp32_str = CompressionSetup.v100_tflops_fp32_str
llm_175b_mem_str = fmt(llm_175b_mem_fp16_gb_value, precision=0, commas=False) resnet_params_m_str = CompressionSetup.resnet_params_m_str
smartphone_ram_str = f"{smartphone_ram_gb_value}" resnet_gflops_str = CompressionSetup.resnet_gflops_str
mcu_ram_str = f"{mcu_ram_kb_value}" mobilenetv2_mflops_str = CompressionSetup.mobilenetv2_mflops_str
gpt3_training_flops_str = f"$3.14 \\times 10^{{{gpt3_training_flops_exp_value}}}$" llm_7b_str = CompressionSetup.llm_7b_str
llm_7b_mem_str = CompressionSetup.llm_7b_mem_str
llm_175b_str = CompressionSetup.llm_175b_str
llm_175b_mem_str = CompressionSetup.llm_175b_mem_str
smartphone_ram_str = CompressionSetup.smartphone_ram_str
mcu_ram_str = CompressionSetup.mcu_ram_str
gpt3_training_flops_str = CompressionSetup.gpt3_training_flops_str
# Note: v100_bw_gbs_value used by downstream fusion-calc cell
v100_bw_gbs_value = CompressionSetup.v100_bw_gbs
v100_tflops_fp32_value = CompressionSetup.v100_tflops_fp32
``` ```
## Optimization Framework {#sec-model-compression-optimization-framework-9e21} ## Optimization Framework {#sec-model-compression-optimization-framework-9e21}
A `{python} llm_7b_str`-billion parameter language model requires `{python} llm_7b_mem_str` GB just to store its weights in FP16. Your deployment target is a smartphone with `{python} smartphone_ram_str` GB of RAM shared across the operating system, applications, and your model. *The math does not work.* No amount of clever engineering changes this arithmetic: `{python} llm_7b_mem_str` GB cannot fit in `{python} smartphone_ram_str` GB. Yet users expect the model to run: responsively, offline, without draining their battery in an hour. The gap between what training produces and what deployment permits (the Latency Budget, the maximum allowable end-to-end inference time, defined formally in @sec-model-serving) is not a minor inconvenience but a defining challenge of model compression. A `{python} llm_7b_str`-billion parameter language model requires `{python} llm_7b_mem_str` GB just to store its weights in FP16. Your deployment target is a smartphone with `{python} smartphone_ram_str` GB of RAM shared across the operating system, applications, and your model. *The math does not work.* No amount of clever engineering changes this arithmetic: `{python} llm_7b_mem_str` GB cannot fit in `{python} smartphone_ram_str` GB. Yet users expect the model to run: responsively, offline, without draining their battery in an hour. The gap between what training produces and what deployment permits (the Latency Budget, the maximum allowable end-to-end inference time, defined formally in @sec-model-serving) is not a minor inconvenience but a defining challenge of model compression.
@@ -420,7 +454,6 @@ We call this phenomenon *the quantization speedup*.
The relative importance of each dimension varies by deployment target. Cloud systems may tolerate larger models but demand throughput; mobile devices prioritize memory and energy; embedded systems face hard constraints on all resources simultaneously. Understanding these deployment contexts shapes which optimization dimensions to prioritize. The relative importance of each dimension varies by deployment target. Cloud systems may tolerate larger models but demand throughput; mobile devices prioritize memory and energy; embedded systems face hard constraints on all resources simultaneously. Understanding these deployment contexts shapes which optimization dimensions to prioritize.
## Deployment Context {#sec-model-compression-deployment-context-0d88} ## Deployment Context {#sec-model-compression-deployment-context-0d88}
The optimization framework above identifies three dimensions of compression, but which dimensions matter most depends entirely on where the model will run. A datacenter GPU with 80 GB of HBM faces different binding constraints than a smartphone with shared RAM or a microcontroller with 256 KB of SRAM. @tbl-deployment-scenarios summarizes the key constraints across deployment environments. The optimization framework above identifies three dimensions of compression, but which dimensions matter most depends entirely on where the model will run. A datacenter GPU with 80 GB of HBM faces different binding constraints than a smartphone with shared RAM or a microcontroller with 256 KB of SRAM. @tbl-deployment-scenarios summarizes the key constraints across deployment environments.
@@ -482,55 +515,80 @@ from mlsys.constants import (GB, GiB, MiB, KiB, MB, KB, byte,
CLOUD_MEM_GIB, MOBILE_MEM_GIB, TINY_MEM_KIB, CLOUD_MEM_GIB, MOBILE_MEM_GIB, TINY_MEM_KIB,
DLRM_MODEL_SIZE_FP32) DLRM_MODEL_SIZE_FP32)
# --- Inputs (device capacities and model sizes) --- def _get_ratio(model_mem, device_mem):
cloud_mem_value = CLOUD_MEM_GIB """Return 'ok' if model fits, else 'no (Nx)' with how many times it overflows."""
mobile_mem_value = MOBILE_MEM_GIB ratio = model_mem.m_as(byte) / device_mem.m_as(byte)
tiny_mem_value = TINY_MEM_KIB
dlrm_mem_value = DLRM_MODEL_SIZE_FP32
gpt2_mem_value = 6 * GiB
resnet_mem_value = 100 * MiB
mobilenet_mem_value = 14 * MiB
mobilenet_int8_mem_value = 3.5 * MiB
dscnn_mem_value = 500 * KiB
# --- Process (compute fit ratios) ---
def get_ratio(model_mem, device_mem):
ratio = model_mem.to(byte).magnitude / device_mem.to(byte).magnitude
if ratio < 1: if ratio < 1:
return "ok" return "ok"
return f"no ({ratio:.0f}x)" return f"no ({ratio:.0f}x)"
dlrm_mobile_value = get_ratio(dlrm_mem_value, mobile_mem_value) class ModelDeviceComparison:
dlrm_tiny_value = get_ratio(dlrm_mem_value, tiny_mem_value) """Contrast model requirements with device memory: 6-order-of-magnitude deployment gap."""
gpt2_mobile_value = get_ratio(gpt2_mem_value, mobile_mem_value) # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
gpt2_tiny_value = get_ratio(gpt2_mem_value, tiny_mem_value) # Device capacities
cloud_mem = CLOUD_MEM_GIB
mobile_mem = MOBILE_MEM_GIB
tiny_mem = TINY_MEM_KIB
resnet_tiny_value = get_ratio(resnet_mem_value, tiny_mem_value) # Model sizes
mobilenet_tiny_value = get_ratio(mobilenet_mem_value, tiny_mem_value) dlrm_mem = DLRM_MODEL_SIZE_FP32
mobilenet_int8_tiny_value = get_ratio(mobilenet_int8_mem_value, tiny_mem_value) gpt2_mem = 6 * GiB
resnet_mem = 100 * MiB
mobilenet_mem = 14 * MiB
mobilenet_int8_mem = 3.5 * MiB
dscnn_mem = 500 * KiB
# --- Outputs (formatted strings for prose) --- # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
dlrm_str = f"{dlrm_mem_value.to(GB).magnitude:.0f} GB" dlrm_mobile = _get_ratio(dlrm_mem, mobile_mem)
gpt2_str = f"{gpt2_mem_value.to(GiB).magnitude:.0f} GB" dlrm_tiny = _get_ratio(dlrm_mem, tiny_mem)
resnet_str = f"{resnet_mem_value.to(MiB).magnitude:.0f} MB" gpt2_mobile = _get_ratio(gpt2_mem, mobile_mem)
mobilenet_str = f"{mobilenet_mem_value.to(MiB).magnitude:.0f} MB" gpt2_tiny = _get_ratio(gpt2_mem, tiny_mem)
mobilenet_int8_str = f"{mobilenet_int8_mem_value.to(MiB).magnitude:.1f} MB" resnet_tiny = _get_ratio(resnet_mem, tiny_mem)
dscnn_str = f"{dscnn_mem_value.to(KiB).magnitude:.0f} KB" mobilenet_tiny = _get_ratio(mobilenet_mem, tiny_mem)
mobilenet_int8_tiny = _get_ratio(mobilenet_int8_mem, tiny_mem)
cloud_cap_str = f"~{cloud_mem_value.to(GiB).magnitude:.0f} GB" # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
mobile_cap_str = f"~{mobile_mem_value.to(GiB).magnitude:.0f} GB" # DS-CNN always fits TinyML — sanity check
tiny_cap_str = f"~{tiny_mem_value.to(KiB).magnitude:.0f} KB" assert _get_ratio(dscnn_mem, tiny_mem) == "ok", "DS-CNN must fit in TinyML device."
dlrm_mobile_str = dlrm_mobile_value # ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
dlrm_tiny_str = dlrm_tiny_value dlrm_str = f"{dlrm_mem.m_as(GB):.0f} GB"
gpt2_mobile_str = gpt2_mobile_value gpt2_str = f"{gpt2_mem.m_as(GiB):.0f} GB"
gpt2_tiny_str = gpt2_tiny_value resnet_str = f"{resnet_mem.m_as(MiB):.0f} MB"
resnet_tiny_str = resnet_tiny_value mobilenet_str = f"{mobilenet_mem.m_as(MiB):.0f} MB"
mobilenet_tiny_str = mobilenet_tiny_value mobilenet_int8_str = f"{mobilenet_int8_mem.m_as(MiB):.1f} MB"
mobilenet_int8_tiny_str = mobilenet_int8_tiny_value dscnn_str = f"{dscnn_mem.m_as(KiB):.0f} KB"
dscnn_tiny_str = "ok" cloud_cap_str = f"~{cloud_mem.m_as(GiB):.0f} GB"
mobile_cap_str = f"~{mobile_mem.m_as(GiB):.0f} GB"
tiny_cap_str = f"~{tiny_mem.m_as(KiB):.0f} KB"
dlrm_mobile_str = dlrm_mobile
dlrm_tiny_str = dlrm_tiny
gpt2_mobile_str = gpt2_mobile
gpt2_tiny_str = gpt2_tiny
resnet_tiny_str = resnet_tiny
mobilenet_tiny_str = mobilenet_tiny
mobilenet_int8_tiny_str = mobilenet_int8_tiny
dscnn_tiny_str = "ok"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
dlrm_str = ModelDeviceComparison.dlrm_str
gpt2_str = ModelDeviceComparison.gpt2_str
resnet_str = ModelDeviceComparison.resnet_str
mobilenet_str = ModelDeviceComparison.mobilenet_str
mobilenet_int8_str = ModelDeviceComparison.mobilenet_int8_str
dscnn_str = ModelDeviceComparison.dscnn_str
cloud_cap_str = ModelDeviceComparison.cloud_cap_str
mobile_cap_str = ModelDeviceComparison.mobile_cap_str
tiny_cap_str = ModelDeviceComparison.tiny_cap_str
dlrm_mobile_str = ModelDeviceComparison.dlrm_mobile_str
dlrm_tiny_str = ModelDeviceComparison.dlrm_tiny_str
gpt2_mobile_str = ModelDeviceComparison.gpt2_mobile_str
gpt2_tiny_str = ModelDeviceComparison.gpt2_tiny_str
resnet_tiny_str = ModelDeviceComparison.resnet_tiny_str
mobilenet_tiny_str = ModelDeviceComparison.mobilenet_tiny_str
mobilenet_int8_tiny_str = ModelDeviceComparison.mobilenet_int8_tiny_str
dscnn_tiny_str = ModelDeviceComparison.dscnn_tiny_str
``` ```
| **Model** | **Memory** **(Runtime)** | **Storage** **(Weights)** | **Cloud** **(`{python} cloud_cap_str`)** | **Mobile** **(`{python} mobile_cap_str`)** | **TinyML** **(`{python} tiny_cap_str`)** | | **Model** | **Memory** **(Runtime)** | **Storage** **(Weights)** | **Cloud** **(`{python} cloud_cap_str`)** | **Mobile** **(`{python} mobile_cap_str`)** | **TinyML** **(`{python} tiny_cap_str`)** |
@@ -600,7 +658,6 @@ Optimization is about trading one resource for another.
Each deployment context above imposes a binding constraint: memory capacity on mobile devices, latency on real-time systems, energy on battery-powered sensors. The optimization techniques that follow address these constraints at three successive levels of the stack. We begin with structural methods that modify *what* computations occur, reducing the model's parameter count and operation count to fit tighter memory and compute budgets. We then turn to precision techniques that reduce how many bits represent each value, directly shrinking memory footprint and accelerating arithmetic. Finally, we address architectural approaches that improve how efficiently the remaining operations execute on physical hardware, closing the gap between theoretical savings and measured performance. Each deployment context above imposes a binding constraint: memory capacity on mobile devices, latency on real-time systems, energy on battery-powered sensors. The optimization techniques that follow address these constraints at three successive levels of the stack. We begin with structural methods that modify *what* computations occur, reducing the model's parameter count and operation count to fit tighter memory and compute budgets. We then turn to precision techniques that reduce how many bits represent each value, directly shrinking memory footprint and accelerating arithmetic. Finally, we address architectural approaches that improve how efficiently the remaining operations execute on physical hardware, closing the gap between theoretical savings and measured performance.
## Structural Optimization {#sec-model-compression-structural-optimization-ee93} ## Structural Optimization {#sec-model-compression-structural-optimization-ee93}
\index{Model Compression!structural optimization} \index{Model Compression!structural optimization}
@@ -2764,7 +2821,6 @@ Test your understanding of the structural optimization techniques covered so far
- [ ] Can you identify when to choose Neural Architecture Search over manual architecture design? Consider the trade-offs in computational cost, design space coverage, and hardware-specific optimization. - [ ] Can you identify when to choose Neural Architecture Search over manual architecture design? Consider the trade-offs in computational cost, design space coverage, and hardware-specific optimization.
::: :::
## Quantization and Precision {#sec-model-compression-quantization-precision-cd46} ## Quantization and Precision {#sec-model-compression-quantization-precision-cd46}
\index{Model Compression!precision optimization} \index{Model Compression!precision optimization}
@@ -3690,44 +3746,57 @@ Compare the two mapping diagrams side by side in @fig-calibration-ranges. Symmet
# │ zero_point_str, x_val_str, x_q_str, x_recon_str # │ zero_point_str, x_val_str, x_q_str, x_recon_str
# └───────────────────────────────────────────────────────────────────────────── # └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
from mlsys.constants import KIB_TO_BYTES
# --- Inputs (activation range example) --- class QuantizationMathCalc:
alpha_value = -1.0 """Derive affine quantization parameters: scale and zero-point for [-1.0, 3.0] → UINT8."""
beta_value = 3.0
bits_value = 8
x_val_value = 0.0 # value to quantize
# --- Process (calculate affine parameters) --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
# 1. Calculate Scale (s) alpha = -1.0 # activation range min
# s = (beta - alpha) / (2^b - 1) beta = 3.0 # activation range max
int_steps_value = 2**bits_value - 1 bits = 8 # target bit-width
scale_value = (beta_value - alpha_value) / int_steps_value x_val = 0.0 # value to quantize
# 2. Calculate Zero-Point (z) # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
# z = round(-alpha / s) # 1. Scale: s = (beta - alpha) / (2^b - 1)
# Note: z maps the real value 0.0 to an integer int_steps = 2**bits - 1
zero_point_value = round(-alpha_value / scale_value) scale = (beta - alpha) / int_steps
# 3. Quantize a value # 2. Zero-point: z = round(-alpha / s)
# x_q = clamp(round(x / s) + z, 0, 2^b - 1) zero_point = round(-alpha / scale)
x_q_raw = round(x_val_value / scale_value) + zero_point_value
x_q_value = max(0, min(int_steps_value, x_q_raw))
# 4. Dequantize (reconstruct) # 3. Quantize: x_q = clamp(round(x/s) + z, 0, 2^b - 1)
# x_recon = (x_q - z) * s x_q_raw = round(x_val / scale) + zero_point
x_recon_value = (x_q_value - zero_point_value) * scale_value x_q = max(0, min(int_steps, x_q_raw))
# --- Outputs (formatted strings for prose) --- # 4. Dequantize: x_recon = (x_q - z) * s
alpha_str = fmt(alpha_value, precision=1, commas=False) # "-1.0" x_recon = (x_q - zero_point) * scale
beta_str = fmt(beta_value, precision=1, commas=False) # "3.0"
range_str = fmt(beta_value - alpha_value, precision=1, commas=False) # "4.0" # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
steps_str = f"{int_steps_value}" # "255" check(scale > 0, "Scale must be positive.")
scale_str = fmt(scale_value, precision=4, commas=False) # "0.0157" check(0 <= zero_point <= int_steps, "Zero-point must be in valid integer range.")
zero_point_str = f"{int(zero_point_value)}" # "64" check(abs(x_recon - x_val) < scale, "Reconstruction error must be less than one step size.")
x_val_str = fmt(x_val_value, precision=1, commas=False) # "0.0"
x_q_str = f"{int(x_q_value)}" # "64" # ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
x_recon_str = fmt(x_recon_value, precision=2, commas=False) # "0.00" alpha_str = fmt(alpha, precision=1, commas=False) # "-1.0"
beta_str = fmt(beta, precision=1, commas=False) # "3.0"
range_str = fmt(beta - alpha, precision=1, commas=False) # "4.0"
steps_str = f"{int_steps}" # "255"
scale_str = fmt(scale, precision=4, commas=False) # "0.0157"
zero_point_str = f"{int(zero_point)}" # "64"
x_val_str = fmt(x_val, precision=1, commas=False) # "0.0"
x_q_str = f"{int(x_q)}" # "64"
x_recon_str = fmt(x_recon, precision=2, commas=False) # "0.00"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
alpha_str = QuantizationMathCalc.alpha_str
beta_str = QuantizationMathCalc.beta_str
range_str = QuantizationMathCalc.range_str
steps_str = QuantizationMathCalc.steps_str
scale_str = QuantizationMathCalc.scale_str
zero_point_str = QuantizationMathCalc.zero_point_str
x_val_str = QuantizationMathCalc.x_val_str
x_q_str = QuantizationMathCalc.x_q_str
x_recon_str = QuantizationMathCalc.x_recon_str
``` ```
::: {.callout-notebook title="Calculating Scale and Zero-Point"} ::: {.callout-notebook title="Calculating Scale and Zero-Point"}
@@ -4326,7 +4395,6 @@ Yet practitioners often discover a frustrating gap between theory and practice:
The gap arises from several sources. Sparse matrices stored in dense format waste memory bandwidth loading zeros—the hardware cannot skip what it does not know is zero. Operations that could run in parallel execute sequentially due to data dependencies the compiler cannot resolve. Simple inputs receive the same computational budget as complex ones because the model has no mechanism to exit early. Closing the gap between "optimized on paper" and "optimized in practice" is the domain of our third optimization dimension: **architectural efficiency**. This dimension ensures that structural and precision optimizations translate into real-world speedups by aligning computation patterns with hardware capabilities. The gap arises from several sources. Sparse matrices stored in dense format waste memory bandwidth loading zeros—the hardware cannot skip what it does not know is zero. Operations that could run in parallel execute sequentially due to data dependencies the compiler cannot resolve. Simple inputs receive the same computational budget as complex ones because the model has no mechanism to exit early. Closing the gap between "optimized on paper" and "optimized in practice" is the domain of our third optimization dimension: **architectural efficiency**. This dimension ensures that structural and precision optimizations translate into real-world speedups by aligning computation patterns with hardware capabilities.
## Architectural Efficiency {#sec-model-compression-architectural-efficiency-8dd3} ## Architectural Efficiency {#sec-model-compression-architectural-efficiency-8dd3}
Architectural efficiency optimization ensures that computations execute efficiently on target hardware by aligning model operations with processor capabilities and memory hierarchies. Where representation optimization determines *what* computations to perform and precision optimization determines *how precisely* to compute, architectural efficiency addresses *how* operations are scheduled, memory is accessed, and workloads adapt to input characteristics. This third dimension closes the gap between theoretical compression ratios and real-world speedups. Architectural efficiency optimization ensures that computations execute efficiently on target hardware by aligning model operations with processor capabilities and memory hierarchies. Where representation optimization determines *what* computations to perform and precision optimization determines *how precisely* to compute, architectural efficiency addresses *how* operations are scheduled, memory is accessed, and workloads adapt to input characteristics. This third dimension closes the gap between theoretical compression ratios and real-world speedups.
@@ -4452,77 +4520,102 @@ Beyond reducing what data must be stored, substantial efficiency gains emerge fr
# │ kernels_fused_str, saved_latency_ms_str # │ kernels_fused_str, saved_latency_ms_str
# └───────────────────────────────────────────────────────────────────────────── # └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
from mlsys.constants import KIB_TO_BYTES from mlsys.constants import KIB_TO_BYTES, MILLION
# --- Inputs (Conv-BN-ReLU) --- class FusionCalc:
conv_channels_value = 256 """Quantify latency and bandwidth benefits of Conv-BN-ReLU operator fusion on ResNet-50."""
conv_spatial_value = 28
bytes_per_element_value = 4
# GEMM # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
gemm_hidden_value = 768 # Conv-BN-ReLU layer geometry
gemm_seq_value = 512 conv_channels = 256
conv_spatial = 28
bytes_per_element = 4 # FP32
# Memory Bandwidth Analysis (ResNet-50 layer) # GEMM geometry
# Feature map: 256 channels × 28 × 28 spatial × 4 bytes/element (FP32) gemm_hidden = 768
feat_map_mb_value = conv_channels_value * conv_spatial_value * conv_spatial_value * bytes_per_element_value / MILLION # SI MB gemm_seq = 512
weights_mb_value = 2.4
bn_params_mb_value = 0.002
# Kernel Launch # ResNet-50 layer memory baseline
kernels_unfused_value = 159 weights_mb = 2.4
kernels_fused_value = 53 bn_params_mb = 0.002
latency_per_kernel_us_value = 10
# --- Process --- # Kernel launch overhead
# Conv-BN-ReLU intermediate kernels_unfused = 159
conv_bn_relu_intermediate_bytes = 2 * conv_channels_value * conv_spatial_value * conv_spatial_value * bytes_per_element_value kernels_fused = 53
conv_bn_relu_intermediate_mb_value = conv_bn_relu_intermediate_bytes / (1024**2) latency_per_kernel_us = 10
# GEMM intermediate # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
gemm_intermediate_bytes = gemm_hidden_value * gemm_seq_value * bytes_per_element_value # Feature map size (SI MB)
gemm_intermediate_mb_value = gemm_intermediate_bytes / (1024**2) feat_map_mb = conv_channels * conv_spatial * conv_spatial * bytes_per_element / MILLION
# Bandwidth Analysis # Conv-BN-ReLU intermediate (2 feature maps written: conv→BN boundary)
unfused_conv_mb_value = feat_map_mb_value * 2 + weights_mb_value conv_bn_relu_intermediate_mb = (
unfused_bn_mb_value = feat_map_mb_value * 2 + bn_params_mb_value 2 * conv_channels * conv_spatial * conv_spatial * bytes_per_element / (1024**2)
unfused_relu_mb_value = feat_map_mb_value * 2 )
total_unfused_mb_value = unfused_conv_mb_value + unfused_bn_mb_value + unfused_relu_mb_value
total_fused_mb_value = feat_map_mb_value * 2 + weights_mb_value # GEMM intermediate
bandwidth_reduction_pct_value = (1 - total_fused_mb_value / total_unfused_mb_value) * 100 gemm_intermediate_mb = gemm_hidden * gemm_seq * bytes_per_element / (1024**2)
# Kernel Launch # Unfused bandwidth: Conv (feat*2 + weights) + BN (feat*2 + bn) + ReLU (feat*2)
saved_latency_us_value = (kernels_unfused_value - kernels_fused_value) * latency_per_kernel_us_value unfused_conv_mb = feat_map_mb * 2 + weights_mb
saved_latency_ms_value = saved_latency_us_value / 1000 unfused_bn_mb = feat_map_mb * 2 + bn_params_mb
unfused_relu_mb = feat_map_mb * 2
total_unfused_mb = unfused_conv_mb + unfused_bn_mb + unfused_relu_mb
# V100 timing analysis (memory-bound) # Fused bandwidth: read input + weights once, write output once
v100_bw_gbs_local_value = v100_bw_gbs_value # from earlier cell total_fused_mb = feat_map_mb * 2 + weights_mb
unfused_time_us_value = total_unfused_mb_value / v100_bw_gbs_local_value * 1000 # MB / (GB/s) * 1000 = us bandwidth_reduction_pct = (1 - total_fused_mb / total_unfused_mb) * 100
fused_time_us_value = total_fused_mb_value / v100_bw_gbs_local_value * 1000
fusion_speedup_value = unfused_time_us_value / fused_time_us_value
# --- Outputs (formatted strings for prose) --- # Kernel launch savings
conv_bn_relu_intermediate_mb_str = fmt(conv_bn_relu_intermediate_mb_value, precision=1, commas=False) saved_latency_us = (kernels_unfused - kernels_fused) * latency_per_kernel_us
gemm_intermediate_mb_str = fmt(gemm_intermediate_mb_value, precision=1, commas=False) saved_latency_ms = saved_latency_us / 1000
feat_map_kb_str = fmt(feat_map_mb_value * 1000, precision=0, commas=False) # V100 timing (memory-bound): MB / (GB/s) * 1000 = µs
weights_mb_str = fmt(weights_mb_value, precision=1, commas=False) unfused_time_us = total_unfused_mb / v100_bw_gbs_value * 1000
bn_params_kb_str = fmt(bn_params_mb_value * KIB_TO_BYTES, precision=0, commas=False) fused_time_us = total_fused_mb / v100_bw_gbs_value * 1000
fusion_speedup = unfused_time_us / fused_time_us
unfused_conv_mb_str = fmt(unfused_conv_mb_value, precision=1, commas=False) # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
unfused_bn_mb_str = fmt(unfused_bn_mb_value, precision=1, commas=False) check(bandwidth_reduction_pct > 40, "Fusion should reduce bandwidth by more than 40%.")
unfused_relu_mb_str = fmt(unfused_relu_mb_value, precision=1, commas=False) check(fusion_speedup > 1, "Fused execution must be faster than unfused.")
total_unfused_mb_str = fmt(total_unfused_mb_value, precision=1, commas=False)
total_fused_mb_str = fmt(total_fused_mb_value, precision=1, commas=False)
bandwidth_reduction_pct_str = fmt(bandwidth_reduction_pct_value, precision=0, commas=False)
kernels_unfused_str = fmt(kernels_unfused_value, precision=0, commas=False) # ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
kernels_fused_str = fmt(kernels_fused_value, precision=0, commas=False) conv_bn_relu_intermediate_mb_str = fmt(conv_bn_relu_intermediate_mb, precision=1, commas=False)
saved_latency_ms_str = fmt(saved_latency_ms_value, precision=0, commas=False) gemm_intermediate_mb_str = fmt(gemm_intermediate_mb, precision=1, commas=False)
unfused_time_us_str = fmt(unfused_time_us_value, precision=0, commas=False) feat_map_kb_str = fmt(feat_map_mb * 1000, precision=0, commas=False)
fused_time_us_str = fmt(fused_time_us_value, precision=1, commas=False) weights_mb_str = fmt(weights_mb, precision=1, commas=False)
fusion_speedup_str = fmt(fusion_speedup_value, precision=2, commas=False) bn_params_kb_str = fmt(bn_params_mb * KIB_TO_BYTES, precision=0, commas=False)
unfused_conv_mb_str = fmt(unfused_conv_mb, precision=1, commas=False)
unfused_bn_mb_str = fmt(unfused_bn_mb, precision=1, commas=False)
unfused_relu_mb_str = fmt(unfused_relu_mb, precision=1, commas=False)
total_unfused_mb_str = fmt(total_unfused_mb, precision=1, commas=False)
total_fused_mb_str = fmt(total_fused_mb, precision=1, commas=False)
bandwidth_reduction_pct_str = fmt(bandwidth_reduction_pct, precision=0, commas=False)
kernels_unfused_str = fmt(kernels_unfused, precision=0, commas=False)
kernels_fused_str = fmt(kernels_fused, precision=0, commas=False)
saved_latency_ms_str = fmt(saved_latency_ms, precision=0, commas=False)
unfused_time_us_str = fmt(unfused_time_us, precision=0, commas=False)
fused_time_us_str = fmt(fused_time_us, precision=1, commas=False)
fusion_speedup_str = fmt(fusion_speedup, precision=2, commas=False)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
conv_bn_relu_intermediate_mb_str = FusionCalc.conv_bn_relu_intermediate_mb_str
gemm_intermediate_mb_str = FusionCalc.gemm_intermediate_mb_str
feat_map_kb_str = FusionCalc.feat_map_kb_str
weights_mb_str = FusionCalc.weights_mb_str
bn_params_kb_str = FusionCalc.bn_params_kb_str
unfused_conv_mb_str = FusionCalc.unfused_conv_mb_str
unfused_bn_mb_str = FusionCalc.unfused_bn_mb_str
unfused_relu_mb_str = FusionCalc.unfused_relu_mb_str
total_unfused_mb_str = FusionCalc.total_unfused_mb_str
total_fused_mb_str = FusionCalc.total_fused_mb_str
bandwidth_reduction_pct_str = FusionCalc.bandwidth_reduction_pct_str
kernels_unfused_str = FusionCalc.kernels_unfused_str
kernels_fused_str = FusionCalc.kernels_fused_str
saved_latency_ms_str = FusionCalc.saved_latency_ms_str
unfused_time_us_str = FusionCalc.unfused_time_us_str
fused_time_us_str = FusionCalc.fused_time_us_str
fusion_speedup_str = FusionCalc.fusion_speedup_str
``` ```
#### Operator Fusion {#sec-model-compression-operator-fusion-ac1d} #### Operator Fusion {#sec-model-compression-operator-fusion-ac1d}
@@ -4594,16 +4687,28 @@ def conv_bn_relu_fused(input, weight, gamma, beta, mean, var):
# └───────────────────────────────────────────────────────────────────────────── # └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check, md_math from mlsys.formatting import fmt, check, md_math
# --- Inputs (transfer counts) --- class ConvFusionCalc:
unfused_transfers_value = 6 # read/write for each of conv, BN, ReLU """Demonstrate 3x memory traffic reduction from Conv-BN-ReLU fusion (6 transfers → 2)."""
fused_transfers_value = 2 # read input, write output
# --- Process --- # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
transfer_reduction_value = unfused_transfers_value / fused_transfers_value unfused_transfers = 6 # read/write for Conv, BN, ReLU
fused_transfers = 2 # read input, write output
# --- Outputs (formatted strings for prose) --- # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
transfer_reduction_str = fmt(transfer_reduction_value, precision=0, commas=False) transfer_reduction = unfused_transfers / fused_transfers
conv_bn_relu_mem_md = md_math(f"2 \\times 256 \\times 28 \\times 28 \\times 4 \\text{{ bytes}} \\approx \\text{{{conv_bn_relu_intermediate_mb_str} MB}}")
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(transfer_reduction == 3, "Conv-BN-ReLU fusion must yield exactly 3x transfer reduction.")
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
transfer_reduction_str = fmt(transfer_reduction, precision=0, commas=False)
conv_bn_relu_mem_md = md_math(
f"2 \\times 256 \\times 28 \\times 28 \\times 4 \\text{{ bytes}} \\approx \\text{{{conv_bn_relu_intermediate_mb_str} MB}}"
)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
transfer_reduction_str = ConvFusionCalc.transfer_reduction_str
conv_bn_relu_mem_md = ConvFusionCalc.conv_bn_relu_mem_md
``` ```
The arithmetic operations remain identical, but memory traffic drops from 6 transfers to 2 transfers (`{python} transfer_reduction_str` $\times$ reduction). For a ResNet-50 layer with 256 channels and spatial size $28 \times 28$, this eliminates `{python} conv_bn_relu_mem_md` of intermediate memory traffic per layer. The arithmetic operations remain identical, but memory traffic drops from 6 transfers to 2 transfers (`{python} transfer_reduction_str` $\times$ reduction). For a ResNet-50 layer with 256 channels and spatial size $28 \times 28$, this eliminates `{python} conv_bn_relu_mem_md` of intermediate memory traffic per layer.
@@ -6276,7 +6381,6 @@ Unlike software functions that compose predictably, optimization techniques inte
With the three optimization dimensions now fully explored, practitioners need systematic guidance for translating this knowledge into deployment decisions. With the three optimization dimensions now fully explored, practitioners need systematic guidance for translating this knowledge into deployment decisions.
## Technique Selection {#sec-model-compression-technique-selection-ba16} ## Technique Selection {#sec-model-compression-technique-selection-ba16}
An engineer deploying a transformer model faces a concrete decision: the model exceeds the target device's memory by 3 $\times$, inference latency is 4 $\times$ above the SLO, and the power budget allows no more than 2 W sustained. Should she quantize first, prune first, distill to a smaller architecture, or combine techniques? The answer depends on which constraint is binding, what accuracy loss is tolerable, and how much engineering time is available. This section provides structured guidance for navigating that decision. An engineer deploying a transformer model faces a concrete decision: the model exceeds the target device's memory by 3 $\times$, inference latency is 4 $\times$ above the SLO, and the power budget allows no more than 2 W sustained. Should she quantize first, prune first, distill to a smaller architecture, or combine techniques? The answer depends on which constraint is binding, what accuracy loss is tolerable, and how much engineering time is available. This section provides structured guidance for navigating that decision.
@@ -6314,7 +6418,6 @@ These choices also depend on the available engineering budget. When fine-tuning
This decision framework provides starting points for individual technique selection. Validating that a chosen technique actually achieves its intended goal requires systematic profiling and measurement, which @sec-model-compression-efficiency-measurement-2424 formalizes in detail. However, production deployments rarely rely on a single technique. Combining pruning with quantization, or distillation with hardware-aware design, introduces interaction effects that can either amplify benefits or create unexpected accuracy degradation. The following section addresses how to sequence and combine techniques effectively. This decision framework provides starting points for individual technique selection. Validating that a chosen technique actually achieves its intended goal requires systematic profiling and measurement, which @sec-model-compression-efficiency-measurement-2424 formalizes in detail. However, production deployments rarely rely on a single technique. Combining pruning with quantization, or distillation with hardware-aware design, introduces interaction effects that can either amplify benefits or create unexpected accuracy degradation. The following section addresses how to sequence and combine techniques effectively.
## Optimization Strategies {#sec-model-compression-optimization-strategies-f2f6} ## Optimization Strategies {#sec-model-compression-optimization-strategies-f2f6}
The decision framework above guides individual technique selection, but the largest optimization gains emerge from combining multiple techniques. Because pruning, quantization, and architectural efficiency operate at different levels of the stack, they provide multiplicative benefits when sequenced appropriately. The decision framework above guides individual technique selection, but the largest optimization gains emerge from combining multiple techniques. Because pruning, quantization, and architectural efficiency operate at different levels of the stack, they provide multiplicative benefits when sequenced appropriately.
@@ -6528,7 +6631,6 @@ This example illustrates why sequencing matters: pruning first concentrates impo
With dozens of techniques across three optimization dimensions, rigorous measurement is essential for validating that optimizations achieve their intended goals. A practitioner who prunes, quantizes, and fuses without profiling the actual impact on target hardware is optimizing blindly. With dozens of techniques across three optimization dimensions, rigorous measurement is essential for validating that optimizations achieve their intended goals. A practitioner who prunes, quantizes, and fuses without profiling the actual impact on target hardware is optimizing blindly.
## Efficiency Measurement {#sec-model-compression-efficiency-measurement-2424} ## Efficiency Measurement {#sec-model-compression-efficiency-measurement-2424}
A model quantized to INT8 should be 4 $\times$ smaller and roughly 3 $\times$ faster, but does it actually achieve those gains on the target hardware? Theoretical compression ratios and measured deployment improvements often diverge, sometimes dramatically, because real speedups depend on memory hierarchy effects, kernel implementations, and hardware utilization patterns that theory alone cannot predict. Translating theoretical compression ratios into measurable deployment improvements therefore requires systematic profiling and evaluation. A model quantized to INT8 should be 4 $\times$ smaller and roughly 3 $\times$ faster, but does it actually achieve those gains on the target hardware? Theoretical compression ratios and measured deployment improvements often diverge, sometimes dramatically, because real speedups depend on memory hierarchy effects, kernel implementations, and hardware utilization patterns that theory alone cannot predict. Translating theoretical compression ratios into measurable deployment improvements therefore requires systematic profiling and evaluation.
@@ -6566,7 +6668,6 @@ With these comprehensive baselines in place, the measurement framework must trac
Rigorous measurement tells practitioners *whether* their optimizations succeeded, but the measurements themselves require tooling to perform. Profiling, quantization, pruning, and deployment all depend on software frameworks that automate otherwise prohibitively complex workflows. We turn now to the implementation tools that make these techniques practical. Rigorous measurement tells practitioners *whether* their optimizations succeeded, but the measurements themselves require tooling to perform. Profiling, quantization, pruning, and deployment all depend on software frameworks that automate otherwise prohibitively complex workflows. We turn now to the implementation tools that make these techniques practical.
## Implementation Tools {#sec-model-compression-implementation-tools-4990} ## Implementation Tools {#sec-model-compression-implementation-tools-4990}
Understanding optimization techniques is necessary but not sufficient; practical implementation relies on robust software support. Without framework tooling, quantization would require manual modification of model definitions and careful insertion of quantization operations throughout the network, while pruning would demand direct manipulation of weight tensors. Both become prohibitively complex as models scale. Understanding optimization techniques is necessary but not sufficient; practical implementation relies on robust software support. Without framework tooling, quantization would require manual modification of model definitions and careful insertion of quantization operations throughout the network, while pruning would demand direct manipulation of weight tensors. Both become prohibitively complex as models scale.
@@ -6655,7 +6756,6 @@ Sparsity heat maps show sparsity distribution across layers (@fig-sparse-heat-ma
With the implementation tools and visualization capabilities established, the natural question is: how do these techniques compare when a practitioner must choose among them? Each optimization approach carries distinct trade-offs in accuracy, training cost, and hardware requirements, and a structured comparison clarifies which to reach for first. With the implementation tools and visualization capabilities established, the natural question is: how do these techniques compare when a practitioner must choose among them? Each optimization approach carries distinct trade-offs in accuracy, training cost, and hardware requirements, and a structured comparison clarifies which to reach for first.
## Technique Comparison {#sec-model-compression-technique-comparison-3142} ## Technique Comparison {#sec-model-compression-technique-comparison-3142}
A comparative analysis across the three major approaches reveals how each addresses distinct aspects of the efficiency-accuracy trade-off. Pruning works best when sparse computation hardware is available and when reducing floating-point operations is critical. Quantization provides the most versatile approach with broad hardware support, making it ideal for diverse deployment scenarios. Knowledge distillation requires significant computational investment but produces consistently high-quality compressed models, making it the right choice when accuracy preservation is paramount. @tbl-optimization-comparison summarizes these trade-offs for systematic technique selection. A comparative analysis across the three major approaches reveals how each addresses distinct aspects of the efficiency-accuracy trade-off. Pruning works best when sparse computation hardware is available and when reducing floating-point operations is critical. Quantization provides the most versatile approach with broad hardware support, making it ideal for diverse deployment scenarios. Knowledge distillation requires significant computational investment but produces consistently high-quality compressed models, making it the right choice when accuracy preservation is paramount. @tbl-optimization-comparison summarizes these trade-offs for systematic technique selection.
@@ -6673,7 +6773,6 @@ These techniques combine synergistically, with quantization often applied after
With the complete optimization toolkit now surveyed—from individual techniques through combination strategies—the most instructive lessons often come not from what works but from what fails. The following fallacies and pitfalls capture the most common mistakes engineers make when applying these techniques, each grounded in the quantitative trade-offs we have established throughout the chapter. With the complete optimization toolkit now surveyed—from individual techniques through combination strategies—the most instructive lessons often come not from what works but from what fails. The following fallacies and pitfalls capture the most common mistakes engineers make when applying these techniques, each grounded in the quantitative trade-offs we have established throughout the chapter.
## Fallacies and Pitfalls {#sec-model-compression-fallacies-pitfalls-1b5e} ## Fallacies and Pitfalls {#sec-model-compression-fallacies-pitfalls-1b5e}
```{python} ```{python}
@@ -6773,7 +6872,6 @@ Teams apply post-training quantization (PTQ) to avoid retraining and achieve 96.
Teams achieve `{python} int8_size_reduction_str` $\times$ model size reduction through INT8 quantization and expect `{python} int8_size_reduction_str` $\times$ memory savings in deployment. In practice, runtime overhead erodes compression gains. Dequantization kernels add `{python} dequant_overhead_str`% latency overhead converting INT8 weights back to FP16. Pruned models with irregular sparsity achieve only 12% latency reduction despite `{python} param_removal_str`% parameter removal because hardware cannot skip zeroed weights efficiently. As @sec-model-compression-profiling-opportunity-analysis-477f demonstrates, a BERT model pruned to 50% sparsity and quantized to INT8 achieves `{python} actual_speedup_str`% end-to-end speedup rather than the expected `{python} expected_speedup_str` $\times$, because unstructured sparsity creates irregular memory access. Production workflows must profile *deployed* latency on target hardware, not extrapolate from compression ratios. Teams achieve `{python} int8_size_reduction_str` $\times$ model size reduction through INT8 quantization and expect `{python} int8_size_reduction_str` $\times$ memory savings in deployment. In practice, runtime overhead erodes compression gains. Dequantization kernels add `{python} dequant_overhead_str`% latency overhead converting INT8 weights back to FP16. Pruned models with irregular sparsity achieve only 12% latency reduction despite `{python} param_removal_str`% parameter removal because hardware cannot skip zeroed weights efficiently. As @sec-model-compression-profiling-opportunity-analysis-477f demonstrates, a BERT model pruned to 50% sparsity and quantized to INT8 achieves `{python} actual_speedup_str`% end-to-end speedup rather than the expected `{python} expected_speedup_str` $\times$, because unstructured sparsity creates irregular memory access. Production workflows must profile *deployed* latency on target hardware, not extrapolate from compression ratios.
## Summary {#sec-model-compression-summary-8229} ## Summary {#sec-model-compression-summary-8229}
Model compression is not a bag of tricks but an engineering discipline built on three complementary dimensions: *structural optimization* determines what the model computes, *precision optimization* determines how precisely it computes, and *architectural optimization* determines how efficiently those computations execute on physical hardware. The most important lesson of this chapter is that these dimensions compose multiplicatively. Pruning alone might achieve 2 $\times$ compression; quantization alone might achieve 4 $\times$; but pruning, distillation, and quantization applied together can achieve 16 $\times$ — as BERT's compression from 440 MB to 28 MB demonstrates. The second lesson is equally important: theoretical compression ratios lie. A 4 $\times$ reduction in parameters translates to 4 $\times$ latency improvement only when the optimization aligns with the hardware's execution model. Unstructured sparsity on hardware that lacks sparse kernels achieves almost nothing; INT8 quantization on hardware without INT8 units achieves even less. Profile on target hardware, not paper metrics. Model compression is not a bag of tricks but an engineering discipline built on three complementary dimensions: *structural optimization* determines what the model computes, *precision optimization* determines how precisely it computes, and *architectural optimization* determines how efficiently those computations execute on physical hardware. The most important lesson of this chapter is that these dimensions compose multiplicatively. Pruning alone might achieve 2 $\times$ compression; quantization alone might achieve 4 $\times$; but pruning, distillation, and quantization applied together can achieve 16 $\times$ — as BERT's compression from 440 MB to 28 MB demonstrates. The second lesson is equally important: theoretical compression ratios lie. 
A 4 $\times$ reduction in parameters translates to 4 $\times$ latency improvement only when the optimization aligns with the hardware's execution model. Unstructured sparsity on hardware that lacks sparse kernels achieves almost nothing; INT8 quantization on hardware without INT8 units achieves even less. Profile on target hardware, not paper metrics.

View File

@@ -21,6 +21,26 @@ When training throughput is low, check MFU, communication fraction, and goodput
```{python} ```{python}
#| label: appendix-c3-setup #| label: appendix-c3-setup
#| echo: false #| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ C³ TAXONOMY — MASTER COMPUTATION
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: PERSISTENT — All values used throughout the C³ Taxonomy appendix:
# │ @tbl-c3-dam-mapping, @tbl-c3-diagnostic-summary, @tbl-c3-traffic-light,
# │ @tbl-c3-bottleneck-actions, three case studies, scorecard, and exercises.
# │
# │ Goal: Provide all C³ diagnostic constants — case study parameters, effective
# │ FLOPS decomposition, and threshold strings — for the fleet-scale
# │ bottleneck classification reference appendix.
# │ Show: See individual section prose for formatted values. This cell provides
# │ the physics; string attributes are display-ready.
# │ How: calc_effective_flops() with MFU, scaling efficiency, and goodput ratio;
# │ all results as raw floats extracted via .m_as() or .magnitude where unitless.
# │
# │ Imports: mlsys.constants (H100_FLOPS_FP16_TENSOR, MFU_*, SCALING_EFF_*, OVERHEAD_*, …)
# │ mlsys.formulas (calc_effective_flops)
# │ mlsys.formatting (fmt, check, md_math)
# │ Exports: C3 = C3Taxonomy (accessed as C3.attribute in downstream cells)
# └─────────────────────────────────────────────────────────────────────────────
import math import math
from mlsys.constants import ( from mlsys.constants import (
@@ -35,15 +55,6 @@ from mlsys.constants import (
from mlsys.formatting import fmt, check, md_math from mlsys.formatting import fmt, check, md_math
from mlsys.formulas import calc_effective_flops from mlsys.formulas import calc_effective_flops
# =============================================================================
# PURPOSE
# =============================================================================
# Purpose: Compute all values for the C³ Taxonomy appendix.
# Used in: Case studies, effective FLOPS, scorecard, and inline prose.
#
# Philosophy: C³ parallels D·A·M — three MECE axes for fleet-scale diagnosis.
# Every computed value traces back to constants.py.
class C3Taxonomy: class C3Taxonomy:
"""Namespace for C³ diagnostic examples.""" """Namespace for C³ diagnostic examples."""
@@ -71,7 +82,7 @@ class C3Taxonomy:
case3_oh_maintenance_pct = OVERHEAD_MAINTENANCE * 100 case3_oh_maintenance_pct = OVERHEAD_MAINTENANCE * 100
# Effective FLOPS calculation: 100K GPU cluster # Effective FLOPS calculation: 100K GPU cluster
h100_tflops = H100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude h100_tflops = H100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second)
n_gpus_eff = 100_000 n_gpus_eff = 100_000
peak_pflops = n_gpus_eff * h100_tflops / 1000 # PFLOPs peak_pflops = n_gpus_eff * h100_tflops / 1000 # PFLOPs
goodput_all = 1.0 - (OVERHEAD_PIPELINE_BUBBLE + goodput_all = 1.0 - (OVERHEAD_PIPELINE_BUBBLE +
@@ -80,7 +91,7 @@ class C3Taxonomy:
OVERHEAD_MAINTENANCE) OVERHEAD_MAINTENANCE)
effective_pflops = calc_effective_flops( effective_pflops = calc_effective_flops(
peak_pflops, MFU_TRAINING_HIGH, SCALING_EFF_8192GPU, goodput_all peak_pflops, MFU_TRAINING_HIGH, SCALING_EFF_8192GPU, goodput_all
) ).magnitude # extract float; calc_effective_flops returns Quantity since formulas.py upgrade
c3_tax = peak_pflops / effective_pflops c3_tax = peak_pflops / effective_pflops
eff_fraction = effective_pflops / peak_pflops eff_fraction = effective_pflops / peak_pflops
@@ -445,12 +456,8 @@ The gap between scaling-law predictions and observed training outcomes is, in la
```{python} ```{python}
#| label: appendix-c3-effective-flops #| label: appendix-c3-effective-flops
#| echo: false #| echo: false
# Goal: Alias C3Taxonomy strings for the 100K-GPU effective FLOPS callout prose.
# ============================================================================= # Exports: peak_str, eff_str, eff_pct_str, c3_tax_str, mfu_str, scaling_str, goodput_str
# PURPOSE
# =============================================================================
# Purpose: Format effective FLOPS values for the worked example.
# Used in: Effective FLOPS worked example prose.
peak_str = C3.peak_pflops_str peak_str = C3.peak_pflops_str
eff_str = C3.effective_pflops_str eff_str = C3.effective_pflops_str

View File

@@ -15,6 +15,23 @@ This appendix collects the reference numbers and compact models for fleet-scale
```{python} ```{python}
#| label: appendix-fleet-setup #| label: appendix-fleet-setup
#| echo: false #| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ FLEET FOUNDATIONS — MASTER COMPUTATION
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: PERSISTENT — All values used throughout the Fleet Foundations
# │ appendix: hardware reference table, MTBF tables, checkpoint sizing,
# │ effective FLOPS, comm-compute ratio, and all prose inline values.
# │
# │ Goal: Provide all quantitative fleet engineering constants in one place
# │ for the "Numbers Every Fleet Engineer Should Know" reference appendix.
# │ Show: See individual section cells for formatted values. This cell provides
# │ the physics; formatting cells convert to display strings.
# │ How: pint Quantities from mlsys.constants; fleet formulas from formulas.py;
# │ all results as typed Quantities or raw floats via .m_as().
# │
# │ Imports: mlsys.constants (*), mlsys.formulas (calc_*), mlsys.formatting (fmt, check)
# │ Exports: FF = FleetFoundations (accessed as FF.attribute in downstream cells)
# └─────────────────────────────────────────────────────────────────────────────
import math import math
from mlsys.constants import * from mlsys.constants import *
@@ -26,27 +43,13 @@ from mlsys.formulas import (
calc_young_daly_interval, calc_checkpoint_size calc_young_daly_interval, calc_checkpoint_size
) )
# =============================================================================
# PURPOSE
# =============================================================================
# Purpose: Compute all values for the Fleet Foundations appendix.
# Used in: Reference tables, worked examples, and inline prose throughout.
#
# Philosophy: Fleet-scale numbers emphasize RATIOS between tiers and
# SCALING BEHAVIOR with cluster size. Absolute values are
# current-generation snapshots; ratios persist across generations.
# =============================================================================
# NETWORK HIERARCHY
# =============================================================================
class FleetFoundations: class FleetFoundations:
"""Namespace for fleet-scale reference calculations.""" """Namespace for fleet-scale reference calculations."""
# ── Communication Numbers ──────────────────────────────────────────────── # ── Communication Numbers ────────────────────────────────────────────────
# Bandwidth hierarchy (GB/s) # Bandwidth hierarchy (GB/s)
nvlink_h100_bw = int(NVLINK_H100_BW.to(GB / second).magnitude) nvlink_h100_bw = int(NVLINK_H100_BW.m_as(GB / second))
pcie5_bw = int(PCIE_GEN5_BW.to(GB / second).magnitude) pcie5_bw = int(PCIE_GEN5_BW.m_as(GB / second))
ib_ndr_bw = INFINIBAND_NDR_BW_GBS ib_ndr_bw = INFINIBAND_NDR_BW_GBS
ib_hdr_bw = INFINIBAND_HDR_BW_GBS ib_hdr_bw = INFINIBAND_HDR_BW_GBS
ib_xdr_bw = INFINIBAND_XDR_BW_GBS ib_xdr_bw = INFINIBAND_XDR_BW_GBS
@@ -95,28 +98,29 @@ class FleetFoundations:
mtbf_100k_h = calc_mtbf_cluster(GPU_MTTF_HOURS, cl_mega) mtbf_100k_h = calc_mtbf_cluster(GPU_MTTF_HOURS, cl_mega)
# Convert to minutes for readability # Convert to minutes for readability
mtbf_256_min = mtbf_256_h * 60 mtbf_256_min = mtbf_256_h.m_as(ureg.minute)
mtbf_2048_min = mtbf_2048_h * 60 mtbf_2048_min = mtbf_2048_h.m_as(ureg.minute)
mtbf_8192_min = mtbf_8192_h * 60 mtbf_8192_min = mtbf_8192_h.m_as(ureg.minute)
mtbf_100k_min = mtbf_100k_h * 60 mtbf_100k_min = mtbf_100k_h.m_as(ureg.minute)
# Failure probability for a 24-hour job (using hours consistently) # Failure probability for a 24-hour job
pfail_256_24h = calc_failure_probability(mtbf_256_h, 24) _24h = 24 * ureg.hour
pfail_2048_24h = calc_failure_probability(mtbf_2048_h, 24) pfail_256_24h = calc_failure_probability(mtbf_256_h, _24h)
pfail_8192_24h = calc_failure_probability(mtbf_8192_h, 24) pfail_2048_24h = calc_failure_probability(mtbf_2048_h, _24h)
pfail_100k_24h = calc_failure_probability(mtbf_100k_h, 24) pfail_8192_24h = calc_failure_probability(mtbf_8192_h, _24h)
pfail_100k_24h = calc_failure_probability(mtbf_100k_h, _24h)
# Checkpoint sizes (bytes) # Checkpoint sizes
ckpt_7b = calc_checkpoint_size(7e9) ckpt_7b = calc_checkpoint_size(7e9) # Quantity[byte]
ckpt_70b = calc_checkpoint_size(70e9) ckpt_70b = calc_checkpoint_size(70e9)
ckpt_175b = calc_checkpoint_size(175e9) ckpt_175b = calc_checkpoint_size(175e9)
ckpt_1t = calc_checkpoint_size(1e12) ckpt_1t = calc_checkpoint_size(1e12)
# Convert to GB # Extract in GB/TB
ckpt_7b_gb = ckpt_7b / 1e9 ckpt_7b_gb = ckpt_7b.m_as(GB)
ckpt_70b_gb = ckpt_70b / 1e9 ckpt_70b_gb = ckpt_70b.m_as(GB)
ckpt_175b_gb = ckpt_175b / 1e9 ckpt_175b_gb = ckpt_175b.m_as(GB)
ckpt_1t_tb = ckpt_1t / 1e12 ckpt_1t_tb = ckpt_1t.m_as(TB)
# Overhead budgets # Overhead budgets
oh_pipeline = int(OVERHEAD_PIPELINE_BUBBLE * 100) oh_pipeline = int(OVERHEAD_PIPELINE_BUBBLE * 100)
@@ -125,20 +129,20 @@ class FleetFoundations:
oh_maintenance = int(OVERHEAD_MAINTENANCE * 100) oh_maintenance = int(OVERHEAD_MAINTENANCE * 100)
# ── Hardware Reference ─────────────────────────────────────────────────── # ── Hardware Reference ───────────────────────────────────────────────────
h100_flops = int(H100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude) h100_flops = int(H100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second))
h100_bw_tbs = f"{H100_MEM_BW.to(TB / second).magnitude:.2f}" h100_bw_tbs = f"{H100_MEM_BW.m_as(TB / second):.2f}"
h100_cap = int(H100_MEM_CAPACITY.to(GiB).magnitude) h100_cap = int(H100_MEM_CAPACITY.m_as(GiB))
h100_tdp = int(H100_TDP.magnitude) h100_tdp = int(H100_TDP.m_as(watt))
b200_flops = int(B200_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude) b200_flops = int(B200_FLOPS_FP16_TENSOR.m_as(TFLOPs / second))
b200_bw_tbs = f"{B200_MEM_BW.to(TB / second).magnitude:.0f}" b200_bw_tbs = f"{B200_MEM_BW.m_as(TB / second):.0f}"
b200_cap = int(B200_MEM_CAPACITY.to(GiB).magnitude) b200_cap = int(B200_MEM_CAPACITY.m_as(GiB))
b200_tdp = int(B200_TDP.magnitude) b200_tdp = int(B200_TDP.m_as(watt))
tpuv5_flops = int(TPUV5P_FLOPS_BF16.to(TFLOPs / second).magnitude) tpuv5_flops = int(TPUV5P_FLOPS_BF16.m_as(TFLOPs / second))
tpuv5_bw_tbs = f"{TPUV5P_MEM_BW.to(TB / second).magnitude:.2f}" tpuv5_bw_tbs = f"{TPUV5P_MEM_BW.m_as(TB / second):.2f}"
tpuv5_cap = int(TPUV5P_MEM_CAPACITY.to(GiB).magnitude) tpuv5_cap = int(TPUV5P_MEM_CAPACITY.m_as(GiB))
tpuv5_ici = int(TPUV5P_ICI_BW.to(GB / second).magnitude) tpuv5_ici = int(TPUV5P_ICI_BW.m_as(GB / second))
# ── Power and Sustainability ───────────────────────────────────────────── # ── Power and Sustainability ─────────────────────────────────────────────
rack_trad = RACK_POWER_TRADITIONAL_KW rack_trad = RACK_POWER_TRADITIONAL_KW
@@ -154,17 +158,19 @@ class FleetFoundations:
# ── Effective FLOPS Example ────────────────────────────────────────────── # ── Effective FLOPS Example ──────────────────────────────────────────────
# 1024-GPU cluster, H100, realistic overheads # 1024-GPU cluster, H100, realistic overheads
peak_1024 = 1024 * H100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude _peak_1024_qty = 1024 * H100_FLOPS_FP16_TENSOR # Quantity[TFLOPs/s]
peak_1024 = _peak_1024_qty.m_as(TFLOPs / second) # raw float for display
goodput_ratio = 1.0 - (OVERHEAD_PIPELINE_BUBBLE + goodput_ratio = 1.0 - (OVERHEAD_PIPELINE_BUBBLE +
OVERHEAD_CHECKPOINT + OVERHEAD_CHECKPOINT +
OVERHEAD_FAILURE_RECOVERY + OVERHEAD_FAILURE_RECOVERY +
OVERHEAD_MAINTENANCE) OVERHEAD_MAINTENANCE)
eff_flops_1024 = calc_effective_flops( _eff_flops_1024_qty = calc_effective_flops(
peak_1024, _peak_1024_qty,
MFU_TRAINING_HIGH, MFU_TRAINING_HIGH,
SCALING_EFF_1024GPU, SCALING_EFF_1024GPU,
goodput_ratio goodput_ratio
) ) # Quantity[flop/second]
eff_flops_1024 = _eff_flops_1024_qty.m_as(TFLOPs / second) # raw float for display
eff_fraction = eff_flops_1024 / peak_1024 eff_fraction = eff_flops_1024 / peak_1024
# ── Invariant Checks ───────────────────────────────────────────────────── # ── Invariant Checks ─────────────────────────────────────────────────────
@@ -289,12 +295,8 @@ Communication defines the boundaries of parallelism. These tables quantify the b
```{python} ```{python}
#| label: fleet-comm-numbers #| label: fleet-comm-numbers
#| echo: false #| echo: false
# Goal: Format communication bandwidth and latency strings for @tbl-fleet-bandwidth-hierarchy and @tbl-fleet-latency-hierarchy.
# ============================================================================= # Exports: nvlink_bw_str, pcie5_bw_str, ib_*_str, tpuv5_ici_str, nvlink_to_ib_str, *_lat_str
# PURPOSE
# =============================================================================
# Purpose: Compute communication hierarchy values for inline references.
# Used in: Communication numbers tables and prose.
# ── Bandwidth ratios ──────────────────────────────────────────────────────── # ── Bandwidth ratios ────────────────────────────────────────────────────────
nvlink_bw_str = fmt(FF.nvlink_h100_bw, precision=0) nvlink_bw_str = fmt(FF.nvlink_h100_bw, precision=0)
@@ -386,12 +388,8 @@ At fleet scale, coordination---failure recovery, checkpointing, and maintenance-
```{python} ```{python}
#| label: fleet-mtbf-table #| label: fleet-mtbf-table
#| echo: false #| echo: false
# Goal: Format MTBF hours, minutes, and P(failure) percentages for @tbl-fleet-mtbf.
# ============================================================================= # Exports: mtbf_256_str, mtbf_2048_str, mtbf_8192_str, mtbf_100k_str, mtbf_*_min_str, pfail_*_str
# PURPOSE
# =============================================================================
# Purpose: Format MTBF and failure probability values for the table.
# Used in: MTBF by cluster size table.
mtbf_256_str = fmt(FF.mtbf_256_h, precision=1, commas=False) mtbf_256_str = fmt(FF.mtbf_256_h, precision=1, commas=False)
mtbf_2048_str = fmt(FF.mtbf_2048_h, precision=1, commas=False) mtbf_2048_str = fmt(FF.mtbf_2048_h, precision=1, commas=False)
@@ -432,12 +430,8 @@ Checkpointing is the primary recovery mechanism, and its cost depends on the mod
```{python} ```{python}
#| label: fleet-checkpoint-sizes #| label: fleet-checkpoint-sizes
#| echo: false #| echo: false
# Goal: Format checkpoint sizes in GB/TB for @tbl-fleet-checkpoint-sizes.
# ============================================================================= # Exports: ckpt_7b_str, ckpt_70b_str, ckpt_175b_str, ckpt_1t_str
# PURPOSE
# =============================================================================
# Purpose: Format checkpoint sizes for the reference table.
# Used in: Checkpoint size table.
ckpt_7b_str = fmt(FF.ckpt_7b_gb, precision=0) ckpt_7b_str = fmt(FF.ckpt_7b_gb, precision=0)
ckpt_70b_str = fmt(FF.ckpt_70b_gb, precision=0) ckpt_70b_str = fmt(FF.ckpt_70b_gb, precision=0)
@@ -484,12 +478,8 @@ These numbers reflect the current generation of fleet-scale hardware. Use them f
```{python} ```{python}
#| label: fleet-hardware-ref #| label: fleet-hardware-ref
#| echo: false #| echo: false
# Goal: Format H100, B200, and TPU v5p specs for @tbl-fleet-hardware-ref.
# ============================================================================= # Exports: h100_flops_str, h100_bw_str, h100_cap_str, h100_tdp_str, b200_*, tpuv5_*
# PURPOSE
# =============================================================================
# Purpose: Format hardware reference values for the comparison table.
# Used in: Current hardware reference table.
h100_flops_str = fmt(FF.h100_flops, precision=0) h100_flops_str = fmt(FF.h100_flops, precision=0)
h100_bw_str = FF.h100_bw_tbs h100_bw_str = FF.h100_bw_tbs
@@ -547,36 +537,52 @@ Volume I introduced Amdahl's Law for a single machine, where the serial fraction
```{python} ```{python}
#| label: fleet-amdahl-example #| label: fleet-amdahl-example
#| echo: false #| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ FLEET AMDAHL EXAMPLE
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: @sec-fleet-foundations-amdahls-fleet worked example
# │
# │ Goal: Compute Amdahl's Law speedup at 32/256/1024/8192 GPUs for 10% serial fraction.
# │ Show: Speedup values and the Amdahl ceiling for inline prose.
# │ How: calc_amdahls_speedup() from formulas.py; check() for invariants.
# │
# │ Imports: mlsys.formulas (calc_amdahls_speedup), mlsys.formatting (fmt, check)
# │ Exports: s_fleet_pct_str, max_speedup_str, su_32_str, su_256_str, su_1024_str, su_8192_str
# └─────────────────────────────────────────────────────────────────────────────
# ============================================================================= class FleetAmdahlExample:
# PURPOSE """Amdahl's Law speedup at 32/256/1024/8192 GPUs for 10% serial fraction."""
# =============================================================================
# Purpose: Compute Amdahl's Law examples at fleet scale.
# Used in: Amdahl's Law at Fleet Scale worked example.
# ── PARAMETERS ────────────────────────────────────────────────────────────── # ── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
s_fleet = 0.10 # 10% serial fraction (communication + sync) s_fleet = 0.10
n_values = [32, 256, 1024, 8192] n_values = [32, 256, 1024, 8192]
# ── CALCULATION ───────────────────────────────────────────────────────────── # ── 2. CALCULATION (The Physics) ────────────────────────────────────────
speedups = {} speedups = {}
for n in n_values: for _n in n_values:
su = calc_amdahls_speedup(1 - s_fleet, n) speedups[_n] = calc_amdahls_speedup(1 - s_fleet, _n)
speedups[n] = su
max_speedup = 1 / s_fleet max_speedup = 1 / s_fleet
# ── INVARIANTS ────────────────────────────────────────────────────────────── # ── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
check(speedups[8192] < max_speedup, "Speedup must be below Amdahl limit") check(speedups[8192] < max_speedup, "Speedup must be below Amdahl limit")
check(max_speedup == 10.0, "Max speedup for 10% serial fraction must be 10x") check(max_speedup == 10.0, "Max speedup for 10% serial fraction must be 10x")
# ── OUTPUTS ───────────────────────────────────────────────────────────────── # ── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
s_fleet_pct_str = "10" s_fleet_pct_str = "10"
max_speedup_str = fmt(max_speedup, precision=0, commas=False) max_speedup_str = fmt(max_speedup, precision=0, commas=False)
su_32_str = fmt(speedups[32], precision=1, commas=False) su_32_str = fmt(speedups[32], precision=1, commas=False)
su_256_str = fmt(speedups[256], precision=1, commas=False) su_256_str = fmt(speedups[256], precision=1, commas=False)
su_1024_str = fmt(speedups[1024], precision=1, commas=False) su_1024_str = fmt(speedups[1024], precision=1, commas=False)
su_8192_str = fmt(speedups[8192], precision=1, commas=False) su_8192_str = fmt(speedups[8192], precision=1, commas=False)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
s_fleet_pct_str = FleetAmdahlExample.s_fleet_pct_str
max_speedup_str = FleetAmdahlExample.max_speedup_str
su_32_str = FleetAmdahlExample.su_32_str
su_256_str = FleetAmdahlExample.su_256_str
su_1024_str = FleetAmdahlExample.su_1024_str
su_8192_str = FleetAmdahlExample.su_8192_str
``` ```
To see the fleet-scale implications, consider a training workload where `{python} s_fleet_pct_str`% of wall-clock time is spent in synchronization, communication, and other serial overhead. Amdahl's Law gives the following speedups: To see the fleet-scale implications, consider a training workload where `{python} s_fleet_pct_str`% of wall-clock time is spent in synchronization, communication, and other serial overhead. Amdahl's Law gives the following speedups:
@@ -604,58 +610,72 @@ When $\rho < 1$, computation dominates and communication can be overlapped. When
```{python} ```{python}
#| label: fleet-comm-comp-ratio #| label: fleet-comm-comp-ratio
#| echo: false #| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ FLEET COMM-COMPUTE RATIO
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: @sec-fleet-foundations-comm-compute-ratio worked example (@tbl-fleet-comm-comp)
# │
# │ Goal: Compute ρ = T_comm / T_comp for 3 scenarios: 7B DP, 350M DP, tensor-parallel.
# │ Show: AllReduce times in ms and ρ ratios for each scenario; ~0.1 for DP 7B, ~3 for DP 350M.
# │ How: calc_ring_allreduce_time() with IB NDR params; NVLink BW for tensor-parallel.
# │
# │ Imports: mlsys.constants (INFINIBAND_NDR_BW_GBS, IB_NDR_LATENCY_US, NVLINK_H100_BW, GB, second)
# │ Exports: ar_7b_ms_str, rho_7b_str, ar_350m_ms_str, rho_350m_str, rho_tp_str
# └─────────────────────────────────────────────────────────────────────────────
# ============================================================================= class FleetCommCompRatio:
# PURPOSE """Communication-to-computation ratio ρ for three parallelism scenarios."""
# =============================================================================
# Purpose: Compute communication-computation ratios for different scenarios.
# Used in: Communication-computation ratio worked example.
# ── SCENARIO 1: Data parallelism, large model ────────────────────────────── # ── SCENARIO 1: Data parallelism, large model ──────────────────────────
# 7B model, 256 GPUs, IB NDR # 7B model, 256 GPUs, IB NDR
grad_bytes_7b = 7e9 * 2 # 7B params * 2 bytes (BF16 gradients) grad_bytes_7b = 7e9 * 2 # 7B params * 2 bytes (BF16 gradients)
allreduce_time_7b = calc_ring_allreduce_time( allreduce_time_7b = calc_ring_allreduce_time(
message_bytes=grad_bytes_7b, message_bytes=grad_bytes_7b,
n_gpus=256, n_gpus=256,
bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9, bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9,
latency_s=IB_NDR_LATENCY_US * 1e-6 latency_s=IB_NDR_LATENCY_US * 1e-6
) ) # Quantity[second]
# Computation time: assume ~50ms forward+backward per step comp_time_7b = 0.050 # 50 ms (seconds)
comp_time_7b = 0.050 # 50 ms rho_7b = allreduce_time_7b.m_as(ureg.second) / comp_time_7b
rho_7b = allreduce_time_7b / comp_time_7b
# ── SCENARIO 2: Data parallelism, small model ────────────────────────────── # ── SCENARIO 2: Data parallelism, small model ──────────────────────────
# 350M model, 256 GPUs, IB NDR # 350M model, 256 GPUs, IB NDR
grad_bytes_350m = 350e6 * 2 # 350M params * 2 bytes grad_bytes_350m = 350e6 * 2 # 350M params * 2 bytes
allreduce_time_350m = calc_ring_allreduce_time( allreduce_time_350m = calc_ring_allreduce_time(
message_bytes=grad_bytes_350m, message_bytes=grad_bytes_350m,
n_gpus=256, n_gpus=256,
bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9, bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9,
latency_s=IB_NDR_LATENCY_US * 1e-6 latency_s=IB_NDR_LATENCY_US * 1e-6
) ) # Quantity[second]
comp_time_350m = 0.005 # 5 ms (smaller model) comp_time_350m = 0.005 # 5 ms (seconds, smaller model)
rho_350m = allreduce_time_350m / comp_time_350m rho_350m = allreduce_time_350m.m_as(ureg.second) / comp_time_350m
# ── SCENARIO 3: Tensor parallelism, within node ──────────────────────────── # ── SCENARIO 3: Tensor parallelism, within node ────────────────────────
# Activation transfer: 8 GPUs, NVLink, ~16 MB per layer # Activation transfer: 8 GPUs, NVLink, ~16 MB per layer
act_bytes = 16e6 # 16 MB act_bytes = 16e6 # 16 MB
act_transfer_time = act_bytes / (NVLINK_H100_BW.to(GB / second).magnitude * 1e9) act_transfer_time = act_bytes / (NVLINK_H100_BW.m_as(GB / second) * 1e9)
comp_time_layer = 0.001 # 1 ms per layer comp_time_layer = 0.001 # 1 ms per layer
rho_tp = act_transfer_time / comp_time_layer rho_tp = act_transfer_time / comp_time_layer
# ── INVARIANTS ────────────────────────────────────────────────────────────── # ── INVARIANTS ──────────────────────────────────────────────────────────
check(rho_7b > 0.1, "7B comm ratio must be non-trivial") check(rho_7b > 0.1, "7B comm ratio must be non-trivial")
check(rho_350m > 0.01, "350M comm ratio must be non-trivial") check(rho_350m > 0.01, "350M comm ratio must be non-trivial")
# ── OUTPUTS ───────────────────────────────────────────────────────────────── # ── OUTPUTS ─────────────────────────────────────────────────────────────
ar_7b_ms_str = fmt(allreduce_time_7b * 1000, precision=1, commas=False) ar_7b_ms_str = fmt(allreduce_time_7b.m_as(ureg.millisecond), precision=1, commas=False)
rho_7b_str = fmt(rho_7b, precision=2, commas=False) rho_7b_str = fmt(rho_7b, precision=2, commas=False)
ar_350m_ms_str = fmt(allreduce_time_350m.m_as(ureg.millisecond), precision=1, commas=False)
rho_350m_str = fmt(rho_350m, precision=1, commas=False)
rho_tp_str = fmt(rho_tp, precision=3, commas=False)
ar_350m_ms_str = fmt(allreduce_time_350m * 1000, precision=1, commas=False) # ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
rho_350m_str = fmt(rho_350m, precision=1, commas=False) ar_7b_ms_str = FleetCommCompRatio.ar_7b_ms_str
rho_7b_str = FleetCommCompRatio.rho_7b_str
rho_tp_str = fmt(rho_tp, precision=3, commas=False) rho_7b = FleetCommCompRatio.rho_7b # raw float used in fmt() call in prose
ar_350m_ms_str = FleetCommCompRatio.ar_350m_ms_str
rho_350m_str = FleetCommCompRatio.rho_350m_str
rho_tp_str = FleetCommCompRatio.rho_tp_str
``` ```
@tbl-fleet-comm-comp shows the ratio for three representative scenarios. The contrast between them reveals why parallelism strategy must match the workload. @tbl-fleet-comm-comp shows the ratio for three representative scenarios. The contrast between them reveals why parallelism strategy must match the workload.
@@ -685,12 +705,8 @@ The key insight for fleet-scale ML is that weak scaling is not just a mathematic
```{python} ```{python}
#| label: fleet-effective-flops #| label: fleet-effective-flops
#| echo: false #| echo: false
# Goal: Format peak and effective FLOPS for the 1,024-GPU compound loss callout.
# ============================================================================= # Exports: peak_str, eff_str, eff_pct_str, goodput_pct_str, mfu_pct_str, scaling_pct_str
# PURPOSE
# =============================================================================
# Purpose: Compute effective FLOPS for the compound loss example.
# Used in: Effective FLOPS worked example.
peak_str = fmt(FF.peak_1024, precision=0) peak_str = fmt(FF.peak_1024, precision=0)
eff_str = fmt(FF.eff_flops_1024, precision=0) eff_str = fmt(FF.eff_flops_1024, precision=0)

View File

@@ -35,6 +35,28 @@ This appendix is designed as a *reference*. Use it when you need to move from in
```{python} ```{python}
#| label: appendix-reliability-setup #| label: appendix-reliability-setup
#| echo: false #| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ RELIABILITY FOUNDATIONS — MASTER COMPUTATION
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: PERSISTENT — All values used throughout the Reliability Foundations
# │ appendix: @tbl-component-fit, @tbl-mtbf-cluster, @tbl-failure-prob,
# │ @tbl-checkpoint-size, @tbl-recovery-anatomy, @tbl-strategy-comparison,
# │ @tbl-availability-stacking, and all Young-Daly worked examples.
# │
# │ Goal: Provide all reliability constants — FIT rates, MTBF cascade, Young-Daly
# │ optimal checkpoint interval, recovery anatomy, and availability stacking —
# │ for the "Failure as a Physical Constraint" reference appendix.
# │ Show: See individual section cells for formatted values. This cell provides
# │ the physics; formatting cells and f-strings convert to display strings.
# │ How: pint Quantities from mlsys.constants; calc_mtbf_node, calc_mtbf_cluster,
# │ calc_young_daly_interval, calc_failure_probability, calc_checkpoint_size,
# │ calc_availability_stacked from formulas.py; all extractions via .m_as().
# │
# │ Imports: mlsys.constants (*), mlsys.formulas (calc_mtbf_*, calc_young_daly_interval,
# │ calc_failure_probability, calc_checkpoint_size, calc_availability_stacked)
# │ mlsys.formatting (fmt, check)
# │ Exports: R = ReliabilityFoundations (accessed as R.attribute in downstream cells)
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import * from mlsys.constants import *
from mlsys.formatting import fmt, check from mlsys.formatting import fmt, check
@@ -103,8 +125,9 @@ class ReliabilityFoundations:
@classmethod @classmethod
def p_failure(cls, n_gpus, duration_hours): def p_failure(cls, n_gpus, duration_hours):
mtbf_h = cls.cluster_mtbf(n_gpus) mtbf_h = cls.cluster_mtbf(n_gpus) # Quantity[hour]
return calc_failure_probability(mtbf_h, duration_hours) dur_h = duration_hours * ureg.hour # attach unit
return calc_failure_probability(mtbf_h, dur_h)
# ┌── 5. CHECKPOINT SIZING ──────────────────────────────────────── # ┌── 5. CHECKPOINT SIZING ────────────────────────────────────────
# Mixed-precision Adam: 16 bytes/param # Mixed-precision Adam: 16 bytes/param
@@ -114,25 +137,28 @@ class ReliabilityFoundations:
@classmethod @classmethod
def ckpt_size_gb(cls, n_params): def ckpt_size_gb(cls, n_params):
return calc_checkpoint_size(n_params, cls.bytes_per_param) / 1e9 return calc_checkpoint_size(n_params, cls.bytes_per_param).m_as(GB)
# ┌── 6. YOUNG-DALY (10K cluster, 175B model) ──────────────────── # ┌── 6. YOUNG-DALY (10K cluster, 175B model) ────────────────────
ckpt_175b_bytes = calc_checkpoint_size(175e9, 16) ckpt_175b_bytes = calc_checkpoint_size(175e9, 16) # Quantity[byte]
ckpt_175b_gb = ckpt_175b_bytes / 1e9 ckpt_175b_gb = ckpt_175b_bytes.m_as(GB) # raw float in GB
ckpt_write_bw = CHECKPOINT_WRITE_BW_GBS # GB/s ckpt_write_bw = CHECKPOINT_WRITE_BW_GBS # GB/s (raw float)
ckpt_write_time_s = ckpt_175b_gb / ckpt_write_bw ckpt_write_time_s = ckpt_175b_gb / ckpt_write_bw # raw float (seconds)
cluster_mtbf_10k_s = cluster_mtbf_10k * SEC_PER_HOUR cluster_mtbf_10k_s = cluster_mtbf_10k.m_as(ureg.second) # raw float (seconds)
tau_opt_s = calc_young_daly_interval(ckpt_write_time_s, cluster_mtbf_10k_s) tau_opt_s = calc_young_daly_interval(ckpt_write_time_s, cluster_mtbf_10k_s) # Quantity[second]
tau_opt_min = tau_opt_s / SECONDS_PER_MINUTE tau_opt_min = tau_opt_s.m_as(ureg.minute) # raw float in minutes
# ┌── 7. RECOVERY TIME ─────────────────────────────────────────── # ┌── 7. RECOVERY TIME ───────────────────────────────────────────
t_detect = HEARTBEAT_TIMEOUT_S t_detect = HEARTBEAT_TIMEOUT_S # raw float (seconds) — kept for table display
t_reschedule = RESCHEDULE_TIME_S t_reschedule = RESCHEDULE_TIME_S # raw float (seconds) — kept for table display
t_reload_s = ckpt_write_time_s # same BW, same size t_reload_s = ckpt_write_time_s # raw float (seconds)
# Replay: half the interval on average # Replay: half the interval on average
t_replay_s = tau_opt_s / 2 t_replay_s = tau_opt_s / 2 # Quantity[second]
t_recovery_total_s = t_detect + t_reschedule + t_reload_s + t_replay_s # Sum: attach units to raw seconds, then extract in minutes
t_recovery_total_s = (
(t_detect + t_reschedule + t_reload_s) * ureg.second + t_replay_s
).m_as(ureg.minute) # raw float in minutes
# ┌── 8. GOODPUT ───────────────────────────────────────────────── # ┌── 8. GOODPUT ─────────────────────────────────────────────────
overhead_ckpt = OVERHEAD_CHECKPOINT overhead_ckpt = OVERHEAD_CHECKPOINT
@@ -150,8 +176,8 @@ class ReliabilityFoundations:
R = ReliabilityFoundations # short alias for inline use R = ReliabilityFoundations # short alias for inline use
# ┌── INVARIANTS ────────────────────────────────────────────────────── # ┌── INVARIANTS ──────────────────────────────────────────────────────
check(R.cluster_mtbf_10k < 5.0, check(R.cluster_mtbf_10k.m_as(ureg.hour) < 5.0,
f"10K cluster MTBF should be < 5 hours, got {R.cluster_mtbf_10k:.2f}") f"10K cluster MTBF should be < 5 hours, got {R.cluster_mtbf_10k.m_as(ureg.hour):.2f}")
check(R.tau_opt_min > 5 and R.tau_opt_min < 60, check(R.tau_opt_min > 5 and R.tau_opt_min < 60,
f"Young-Daly interval should be 5-60 min, got {R.tau_opt_min:.1f}") f"Young-Daly interval should be 5-60 min, got {R.tau_opt_min:.1f}")
check(R.p_failure(10_000, 24) > 0.99, check(R.p_failure(10_000, 24) > 0.99,
@@ -159,12 +185,12 @@ check(R.p_failure(10_000, 24) > 0.99,
# ┌── FORMATTED OUTPUTS ────────────────────────────────────────────── # ┌── FORMATTED OUTPUTS ──────────────────────────────────────────────
gpu_mttf_str = fmt(R.gpu_mttf, precision=0) gpu_mttf_str = fmt(R.gpu_mttf, precision=0)
node_mtbf_str = fmt(R.node_mtbf, precision=0) node_mtbf_str = fmt(R.node_mtbf.m_as(ureg.hour), precision=0)
cluster_mtbf_10k_str = fmt(R.cluster_mtbf_10k, precision=2) cluster_mtbf_10k_str = fmt(R.cluster_mtbf_10k.m_as(ureg.hour), precision=2)
tau_opt_min_str = fmt(R.tau_opt_min, precision=1) tau_opt_min_str = fmt(R.tau_opt_min, precision=1)
ckpt_175b_gb_str = fmt(R.ckpt_175b_gb, precision=0) ckpt_175b_gb_str = fmt(R.ckpt_175b_gb, precision=0)
ckpt_write_time_str = fmt(R.ckpt_write_time_s, precision=1) ckpt_write_time_str = fmt(R.ckpt_write_time_s, precision=1)
t_recovery_str = fmt(R.t_recovery_total_s / SECONDS_PER_MINUTE, precision=1) t_recovery_str = fmt(R.t_recovery_total_s, precision=1)
``` ```
## Failure Probability at Scale {#sec-reliability-foundations-failure-probability} ## Failure Probability at Scale {#sec-reliability-foundations-failure-probability}
@@ -188,8 +214,8 @@ $$ \text{MTTF} = \frac{10^9}{\text{FIT}} $$ {#eq-mttf-from-fit}
```{python} ```{python}
#| label: component-fit-table #| label: component-fit-table
#| echo: false #| echo: false
# Goal: Format per-component MTTF in years for @tbl-component-fit.
# Format component data for the table # Exports: gpu_mttf_yr, hbm_mttf_yr, nic_mttf_yr, psu_mttf_yr, pcie_mttf_yr, cable_mttf_yr, tor_mttf_yr
gpu_mttf_yr = f"{R.gpu_mttf / HOURS_PER_YEAR:.1f}" gpu_mttf_yr = f"{R.gpu_mttf / HOURS_PER_YEAR:.1f}"
hbm_mttf_yr = f"{R.hbm_mttf / HOURS_PER_YEAR:.1f}" hbm_mttf_yr = f"{R.hbm_mttf / HOURS_PER_YEAR:.1f}"
nic_mttf_yr = f"{R.nic_mttf / HOURS_PER_YEAR:.1f}" nic_mttf_yr = f"{R.nic_mttf / HOURS_PER_YEAR:.1f}"
@@ -233,24 +259,24 @@ For a cluster of $N$ identical nodes, the same logic applies one level up:
$$ \text{MTBF}_\text{cluster} = \frac{\text{MTBF}_\text{node}}{N} $$ {#eq-mtbf-cluster} $$ \text{MTBF}_\text{cluster} = \frac{\text{MTBF}_\text{node}}{N} $$ {#eq-mtbf-cluster}
This is the **MTBF cascade**: reliability degrades linearly with component count at each level, and the levels compound. A node with `{python} f"{R.node_mtbf:,.0f}"`-hour MTBF sounds reliable. A cluster of `{python} f"{R.nodes_10k:,}"` such nodes has an MTBF of just `{python} cluster_mtbf_10k_str` hours---a failure every few hours is the expected steady state. This is the **MTBF cascade**: reliability degrades linearly with component count at each level, and the levels compound. A node with `{python} f"{R.node_mtbf.m_as(ureg.hour):,.0f}"`-hour MTBF sounds reliable. A cluster of `{python} f"{R.nodes_10k:,}"` such nodes has an MTBF of just `{python} cluster_mtbf_10k_str` hours---a failure every few hours is the expected steady state.
@tbl-mtbf-cluster shows how cluster MTBF shrinks as fleet size grows. @tbl-mtbf-cluster shows how cluster MTBF shrinks as fleet size grows.
```{python} ```{python}
#| label: mtbf-cluster-table #| label: mtbf-cluster-table
#| echo: false #| echo: false
# Goal: Build MTBF row data (hours or minutes, failures/day) for @tbl-mtbf-cluster.
# Build MTBF table data # Exports: mtbf_data list of dicts with "gpus", "nodes", "mtbf", "per_day" keys
mtbf_data = [] mtbf_data = []
for n_gpus in R.cluster_sizes: for n_gpus in R.cluster_sizes:
n_nodes = R.nodes_for_gpus(n_gpus) n_nodes = R.nodes_for_gpus(n_gpus)
mtbf_h = R.cluster_mtbf(n_gpus) mtbf_h_val = R.cluster_mtbf(n_gpus).m_as(ureg.hour) # raw float in hours
if mtbf_h >= 1.0: if mtbf_h_val >= 1.0:
mtbf_str = f"{mtbf_h:.1f} hours" mtbf_str = f"{mtbf_h_val:.1f} hours"
else: else:
mtbf_str = f"{mtbf_h * SECONDS_PER_MINUTE:.0f} minutes" mtbf_str = f"{mtbf_h_val * 60:.0f} minutes"
per_day = 24 / mtbf_h per_day = 24 / mtbf_h_val
mtbf_data.append({ mtbf_data.append({
"gpus": f"{n_gpus:,}", "gpus": f"{n_gpus:,}",
"nodes": f"{n_nodes:,}", "nodes": f"{n_nodes:,}",
@@ -292,8 +318,8 @@ When $T_\text{job} \gg \text{MTBF}$, this probability approaches 1 rapidly. @tbl
```{python} ```{python}
#| label: failure-probability-table #| label: failure-probability-table
#| echo: false #| echo: false
# Goal: Compute P(≥1 failure) matrix for @tbl-failure-prob across cluster sizes and job durations.
# Build failure probability matrix # Exports: fp_data dict keyed by n_gpus; values are [1-day, 1-week, 30-day] probability strings
dur_labels = ["1 Day", "1 Week", "30 Days"] dur_labels = ["1 Day", "1 Week", "30 Days"]
fp_data = {} fp_data = {}
for n_gpus in R.cluster_sizes: for n_gpus in R.cluster_sizes:
@@ -370,6 +396,8 @@ $$ \text{Checkpoint Size} = N_\text{params} \times 16 \text{ bytes/param} $$ {#e
```{python} ```{python}
#| label: checkpoint-sizing-table #| label: checkpoint-sizing-table
#| echo: false #| echo: false
# Goal: Format checkpoint sizes and write times for @tbl-checkpoint-size across 7B1T models.
# Exports: ckpt_data list of dicts with "label", "ckpt_gb", "write_time" keys
ckpt_data = [] ckpt_data = []
for i, n_params in enumerate(R.model_sizes_params): for i, n_params in enumerate(R.model_sizes_params):
@@ -407,28 +435,50 @@ At frontier scale (175B+ parameters), checkpoint sizes reach the terabyte range.
```{python} ```{python}
#| label: worked-example-young-daly #| label: worked-example-young-daly
#| echo: false #| echo: false
# ┌─────────────────────────────────────────────────────────────────────────────
# │ YOUNG-DALY WORKED EXAMPLE
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: @sec-reliability-foundations-worked-example callout
# │
# │ Goal: Compute optimal checkpoint interval τ_opt for 175B model on 10K-GPU cluster;
# │ show scaling to 20K GPUs.
# │ Show: ~28 min optimal interval, ~X% checkpoint overhead, shorter interval at 20K GPUs.
# │ How: calc_young_daly_interval(δ, MTBF_s) from R.ckpt_write_time_s and R.cluster_mtbf_10k_s.
# │
# │ Imports: mlsys.formulas (calc_young_daly_interval), mlsys.constants (GPUS_PER_HOST)
# │ Exports: yd_mtbf_h_str, yd_delta_str, yd_tau_min_str, yd_overhead_str, tau_20k_min_str
# └─────────────────────────────────────────────────────────────────────────────
# All values already computed in ReliabilityFoundations class WorkedExampleYoungDaly:
yd_mtbf_h = R.cluster_mtbf_10k """Young-Daly optimal checkpoint interval for 175B model on 10K-GPU cluster."""
yd_mtbf_s = R.cluster_mtbf_10k_s # All values already computed in ReliabilityFoundations
yd_delta = R.ckpt_write_time_s yd_mtbf_h = R.cluster_mtbf_10k # Quantity[hour]
yd_tau_s = R.tau_opt_s yd_mtbf_s = R.cluster_mtbf_10k_s # raw float (seconds)
yd_tau_min = R.tau_opt_min yd_delta = R.ckpt_write_time_s # raw float (seconds)
yd_tau_s = R.tau_opt_s # Quantity[second]
yd_tau_min = R.tau_opt_min # raw float in minutes
# Overhead from checkpointing alone # Overhead from checkpointing alone
yd_ckpt_overhead = (yd_delta / yd_tau_s) * 100 yd_ckpt_overhead = (yd_delta / yd_tau_s.m_as(ureg.second)) * 100
# What if MTBF halves (20K GPUs)? # What if MTBF halves (20K GPUs)?
mtbf_20k_h = R.node_mtbf / (20_000 // GPUS_PER_HOST) mtbf_20k_h = R.node_mtbf / (20_000 // GPUS_PER_HOST) # Quantity[hour]
mtbf_20k_s = mtbf_20k_h * SEC_PER_HOUR mtbf_20k_s = mtbf_20k_h.m_as(ureg.second) # raw float (seconds)
tau_20k_s = calc_young_daly_interval(yd_delta, mtbf_20k_s) tau_20k_s = calc_young_daly_interval(yd_delta, mtbf_20k_s) # Quantity[second]
tau_20k_min = tau_20k_s / SECONDS_PER_MINUTE tau_20k_min = tau_20k_s.m_as(ureg.minute) # raw float in minutes
yd_mtbf_h_str = fmt(yd_mtbf_h, precision=2) yd_mtbf_h_str = fmt(yd_mtbf_h.m_as(ureg.hour), precision=2)
yd_delta_str = fmt(yd_delta, precision=1) yd_delta_str = fmt(yd_delta, precision=1)
yd_tau_min_str = fmt(yd_tau_min, precision=1) yd_tau_min_str = fmt(yd_tau_min, precision=1)
yd_overhead_str = fmt(yd_ckpt_overhead, precision=1) yd_overhead_str = fmt(yd_ckpt_overhead, precision=1)
tau_20k_min_str = fmt(tau_20k_min, precision=1) tau_20k_min_str = fmt(tau_20k_min, precision=1)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
yd_mtbf_h_str = WorkedExampleYoungDaly.yd_mtbf_h_str
yd_delta_str = WorkedExampleYoungDaly.yd_delta_str
yd_tau_min_str = WorkedExampleYoungDaly.yd_tau_min_str
yd_overhead_str = WorkedExampleYoungDaly.yd_overhead_str
tau_20k_min_str = WorkedExampleYoungDaly.tau_20k_min_str
``` ```
::: {.callout-example title="Young-Daly: 175B Model on a 10,000-GPU Cluster"} ::: {.callout-example title="Young-Daly: 175B Model on a 10,000-GPU Cluster"}
@@ -470,12 +520,14 @@ $$ T_\text{recovery} = T_\text{detect} + T_\text{reschedule} + T_\text{reload} +
```{python} ```{python}
#| label: recovery-anatomy-table #| label: recovery-anatomy-table
#| echo: false #| echo: false
# Goal: Format recovery phase durations for @tbl-recovery-anatomy.
# Exports: t_detect_str, t_reschedule_str, t_reload_str, t_replay_str, t_total_str
t_detect_str = f"{R.t_detect}" t_detect_str = f"{R.t_detect}"
t_reschedule_str = f"{R.t_reschedule}" t_reschedule_str = f"{R.t_reschedule}"
t_reload_str = fmt(R.t_reload_s, precision=1) t_reload_str = fmt(R.t_reload_s, precision=1)
t_replay_str = fmt(R.t_replay_s / SECONDS_PER_MINUTE, precision=1) t_replay_str = fmt(R.t_replay_s.m_as(ureg.minute), precision=1)
t_total_str = fmt(R.t_recovery_total_s / SECONDS_PER_MINUTE, precision=1) t_total_str = fmt(R.t_recovery_total_s, precision=1)
``` ```
+----------------------------+---------------------------+-------------------------------------------------+ +----------------------------+---------------------------+-------------------------------------------------+
@@ -567,6 +619,8 @@ where $A$ is the availability of a single replica and $k$ is the number of repli
```{python} ```{python}
#| label: availability-stacking-table #| label: availability-stacking-table
#| echo: false #| echo: false
# Goal: Format availability, nines count, and annual downtime for @tbl-availability-stacking.
# Exports: avail_data list of dicts with "k", "avail", "nines", "downtime" keys
avail_data = [] avail_data = []
for k in R.avail_replicas: for k in R.avail_replicas:

View File

@@ -27,7 +27,8 @@ from mlsys.constants import (
CLOUD_EGRESS_PER_GB, USD, CLOUD_EGRESS_PER_GB, USD,
STORAGE_COST_S3_STD, STORAGE_COST_GLACIER, STORAGE_COST_S3_STD, STORAGE_COST_GLACIER,
STORAGE_COST_NVME_LOW, STORAGE_COST_NVME_HIGH, STORAGE_COST_NVME_LOW, STORAGE_COST_NVME_HIGH,
Mparam, Bparam, TFLOPs, GFLOPs Mparam, Bparam, TFLOPs, GFLOPs,
watt
) )
from mlsys.formatting import fmt, sci, check from mlsys.formatting import fmt, sci, check
@@ -77,13 +78,25 @@ Accelerators can compute faster than storage can feed them. A modern GPU process
# ┌───────────────────────────────────────────────────────────────────────────── # ┌─────────────────────────────────────────────────────────────────────────────
# │ STORAGE HIERARCHY AND MODEL SPECIFICATIONS # │ STORAGE HIERARCHY AND MODEL SPECIFICATIONS
# ├───────────────────────────────────────────────────────────────────────────── # ├─────────────────────────────────────────────────────────────────────────────
# │ Context: Used across the chapter for hierarchy tables and bottleneck analysis. # │ Context: @sec-data-storage storage hierarchy tables and I/O bottleneck
# │ analysis paragraphs throughout the chapter.
# │ # │
# │ Goal: Provide quantitative specs for hardware and lighthouse models. # │ Goal: Establish the six-tier storage hierarchy gap by computing H100 HBM
# │ Show: The massive gap between HBM bandwidth and disk I/O. # │ bandwidth (H100_MEM_BW) vs NVMe sequential bandwidth (NVME_SEQUENTIAL_BW),
# │ and estimate GPT-3 checkpoint write time (GPT3_PARAMS, FP16, at NVMe
# │ vs network storage) to show the I/O bottleneck in fault tolerance.
# │ Show: "3.35" TB/s H100 HBM vs "~7" GB/s NVMe — inline in the storage
# │ hierarchy tier comparison and checkpoint I/O bottleneck paragraphs.
# │ How: Direct .m_as() for each unit conversion; H100_TDP .m_as(watt).
# │ # │
# │ Imports: mlsys.constants # │ Imports: mlsys.constants (A100_MEM_CAPACITY, H100_MEM_CAPACITY, H100_MEM_BW,
# │ Exports: a100_mem, h100_bw_tbs, gpt3_params_b, resnet_params_m, etc. # │ H100_FLOPS_FP8_TENSOR, H100_FLOPS_FP16_TENSOR, H100_TDP,
# │ GPT3_PARAMS, RESNET50_PARAMS, NVME_SEQUENTIAL_BW,
# │ NVLINK_H100_BW, PCIE_GEN5_BW, GiB, TB, TFLOPs, GB, second,
# │ watt, Bparam, Mparam)
# │ Exports: a100_mem, h100_mem, h100_bw_tbs, h100_fp8_tflops, h100_fp16_tflops,
# │ h100_tdp_w, gpt3_params_b, resnet_params_m, nvme_bw,
# │ nvlink_bw_gbs, pcie5_bw_gbs
# └───────────────────────────────────────────────────────────────────────────── # └─────────────────────────────────────────────────────────────────────────────
import math import math
@@ -93,21 +106,21 @@ class StorageSetup:
Namespace for global storage constants and specs. Namespace for global storage constants and specs.
""" """
# GPU specs # GPU specs
a100_mem = A100_MEM_CAPACITY.to(GiB).magnitude a100_mem = A100_MEM_CAPACITY.m_as(GiB)
h100_mem = H100_MEM_CAPACITY.to(GiB).magnitude h100_mem = H100_MEM_CAPACITY.m_as(GiB)
h100_bw = H100_MEM_BW.to(TB/second).magnitude h100_bw = H100_MEM_BW.m_as(TB/second)
h100_fp8 = H100_FLOPS_FP8_TENSOR.to(TFLOPs/second).magnitude h100_fp8 = H100_FLOPS_FP8_TENSOR.m_as(TFLOPs/second)
h100_fp16 = H100_FLOPS_FP16_TENSOR.to(TFLOPs/second).magnitude h100_fp16 = H100_FLOPS_FP16_TENSOR.m_as(TFLOPs/second)
h100_tdp = H100_TDP.magnitude h100_tdp = H100_TDP.m_as(watt)
# Model specs # Model specs
gpt3_params = GPT3_PARAMS.to(Bparam).magnitude gpt3_params = GPT3_PARAMS.m_as(Bparam)
resnet_params = RESNET50_PARAMS.to(Mparam).magnitude resnet_params = RESNET50_PARAMS.m_as(Mparam)
# Storage & Interconnect # Storage & Interconnect
nvme_bw = NVME_SEQUENTIAL_BW.to(GB/second).magnitude nvme_bw = NVME_SEQUENTIAL_BW.m_as(GB/second)
nvlink_bw = NVLINK_H100_BW.to(GB/second).magnitude nvlink_bw = NVLINK_H100_BW.m_as(GB/second)
pcie5_bw = PCIE_GEN5_BW.to(GB/second).magnitude pcie5_bw = PCIE_GEN5_BW.m_as(GB/second)
# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── # ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
a100_mem = f"{StorageSetup.a100_mem:.0f}" a100_mem = f"{StorageSetup.a100_mem:.0f}"
@@ -125,11 +138,11 @@ nvlink_bw_gbs = f"{StorageSetup.nvlink_bw:.0f}"
pcie5_bw_gbs = f"{StorageSetup.pcie5_bw:.0f}" pcie5_bw_gbs = f"{StorageSetup.pcie5_bw:.0f}"
# Storage # Storage
nvme_bw = f"{NVME_SEQUENTIAL_BW.to(GB/second).magnitude:.1f}" nvme_bw = f"{NVME_SEQUENTIAL_BW.m_as(GB/second):.1f}"
# Interconnect # Interconnect
nvlink_bw_gbs = f"{NVLINK_H100_BW.to(GB/second).magnitude:.0f}" nvlink_bw_gbs = f"{NVLINK_H100_BW.m_as(GB/second):.0f}"
pcie5_bw_gbs = f"{PCIE_GEN5_BW.to(GB/second).magnitude:.0f}" pcie5_bw_gbs = f"{PCIE_GEN5_BW.m_as(GB/second):.0f}"
# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── # ┌── P.I.C.O. ISOLATED SCENARIO ───────────────────────────────────────────────
class StorageEconomics: class StorageEconomics:

View File

@@ -40,25 +40,66 @@ A single GPU fails perhaps once per year. A thousand GPUs experience failures da
::: :::
```{python} ```{python}
#| label: fault-tolerance-setup
#| echo: false #| echo: false
#| label: fault-tolerance-setup
# ┌─────────────────────────────────────────────────────────────────────────────
# │ FAULT TOLERANCE CHAPTER SETUP
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: Chapter-wide registry — values used in §Young-Daly Law
# │ (@eq-young-daly-applied, line ~1957), §Sharded Checkpointing (line ~2289),
# │ and §Recovery Cost (line ~2365).
# │
# │ Goal: Pre-compute GPT-3 checkpoint size (weights + Adam states) and
# │ per-worker shard size for 1000-worker training, motivating the
# │ checkpoint-interval formula and distributed checkpoint design.
# │ Show: gpt3_ckpt_tb="2.1" TB (full checkpoint),
# │ gpt3_shard_gb="2.1" GB (per-worker shard at 1000 workers) — inline in prose.
# │ How: Multiply GPT3_PARAMS.m_as(param) by bytes-per-param for each state;
# │ convert result pint Quantity with .m_as(TB) and .m_as(GB).
# │
# │ Imports: mlsys.constants (GPT3_PARAMS, param, byte, TB, GB, BILLION),
# │ mlsys.formatting (fmt, sci)
# │ Exports: gpt3_params_b, gpt3_ckpt_tb, gpt3_adam_tb, gpt3_shard_gb
# │ Note: PERSISTENT — gpt3_ckpt_tb used in §Young-Daly (line ~1957),
# │ §Sharded Checkpointing (line ~2289), §Recovery (line ~2365, ~2385);
# │ gpt3_shard_gb used in §Sharded Checkpointing (line ~2289), §Recovery (~2371, ~2385).
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import * from mlsys.constants import *
from mlsys.formatting import fmt, sci from mlsys.formatting import fmt, sci
# GPT-3 model parameters # ┌── P.I.C.O. ISOLATED SCENARIO ───────────────────────────────────────────────
gpt3_params_b = f"{GPT3_PARAMS.to(param).magnitude / BILLION:.0f}" class FaultToleranceSetup:
"""Namespace for GPT-3 checkpoint sizing and shard calculations."""
# GPT-3 checkpoint size: weights (2 bytes FP16) + Adam m (4 bytes) + Adam v (4 bytes) = 12 bytes/param # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
gpt3_ckpt_bytes = GPT3_PARAMS.magnitude * 12 * byte # GPT-3 checkpoint byte layout:
gpt3_ckpt_tb = f"{gpt3_ckpt_bytes.to(TB).magnitude:.1f}" # weights (2 bytes FP16) + Adam m (4 bytes) + Adam v (4 bytes) = 12 bytes/param
bytes_full_ckpt = 12 # bytes per param: weights + Adam m + v
bytes_adam_only = 8 # bytes per param: Adam m + v only
n_workers = 1000 # workers for shard size calculation
# GPT-3 Adam optimizer state: m + v = 8 bytes/param # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
gpt3_adam_bytes = GPT3_PARAMS.magnitude * 8 * byte # Full checkpoint: weights + optimizer states
gpt3_adam_tb = f"{gpt3_adam_bytes.to(TB).magnitude:.1f}" gpt3_ckpt_bytes = GPT3_PARAMS.m_as(param) * bytes_full_ckpt * byte
# Per-worker shard for 1000 workers # Optimizer-only checkpoint: Adam m + v (no weights)
gpt3_shard_gb = f"{gpt3_ckpt_bytes.to(GB).magnitude / 1000:.1f}" gpt3_adam_bytes = GPT3_PARAMS.m_as(param) * bytes_adam_only * byte
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
# No check() calls needed — values are monotone functions of constants.
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
gpt3_params_b = f"{GPT3_PARAMS.m_as(param) / BILLION:.0f}"
gpt3_ckpt_tb = f"{gpt3_ckpt_bytes.m_as(TB):.1f}"
gpt3_adam_tb = f"{gpt3_adam_bytes.m_as(TB):.1f}"
gpt3_shard_gb = f"{gpt3_ckpt_bytes.m_as(GB) / n_workers:.1f}"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
gpt3_params_b = FaultToleranceSetup.gpt3_params_b
gpt3_ckpt_tb = FaultToleranceSetup.gpt3_ckpt_tb
gpt3_adam_tb = FaultToleranceSetup.gpt3_adam_tb
gpt3_shard_gb = FaultToleranceSetup.gpt3_shard_gb
``` ```
## Failure Analysis at Scale {#sec-fault-tolerance-reliability-reliability-failure-analysis-scale-6b4b} ## Failure Analysis at Scale {#sec-fault-tolerance-reliability-reliability-failure-analysis-scale-6b4b}
@@ -2123,45 +2164,88 @@ Imagine 10,000 GPUs, each holding a 10 GB shard of the model state, simultaneous
While @tbl-checkpoint-overhead-by-model suggests modest overhead percentages, real deployments often encounter checkpoint times far exceeding these theoretical estimates. Diagnosing such discrepancies requires examining the full system stack. While @tbl-checkpoint-overhead-by-model suggests modest overhead percentages, real deployments often encounter checkpoint times far exceeding these theoretical estimates. Diagnosing such discrepancies requires examining the full system stack.
```{python} ```{python}
#| label: checkpoint-debug-calc
#| echo: false #| echo: false
#| label: checkpoint-debug-calc
# ┌─────────────────────────────────────────────────────────────────────────────
# │ CHECKPOINT DEBUG CALCULATION
# ├─────────────────────────────────────────────────────────────────────────────
# │ Context: "Debugging Checkpoint Overhead" callout in §Checkpoint Overhead.
# │
# │ Goal: Diagnose why a 70B model checkpoint takes 10 minutes instead of
# │ 2 minutes on an NFS-backed cluster, by computing theoretical bandwidth
# │ limits and contention-induced effective throughput per node.
# │ Show: total_ckpt_gb_str="420" GB, nfs_gbs_str="1.25" GB/s,
# │ min_write_min_str="5.6" min, per_node_mbs_str="20" MB/s,
# │ serialized_min_str="5,600" min — inline in the Fleet Stack diagnosis.
# │ How: Compute weights + optimizer state size in GB; derive NFS bandwidth in
# │ GB/s (10 Gbps / 8); calculate min write time and per-node bandwidth
# │ under contention from 64 concurrent nodes.
# │
# │ Imports: (none — pure Python arithmetic, no pint quantities)
# │ Exports: weights_gb_str, optimizer_gb_str, total_ckpt_gb_str, nfs_gbs_str,
# │ min_write_s_str, min_write_min_str, per_node_mbs_str, serialized_min_str,
# │ extended_weeks_str, extra_cost_k_str
# └─────────────────────────────────────────────────────────────────────────────
# 70B model checkpoint sizing class CheckpointDebugCalc:
model_params_b = 70 # billions """Diagnose 70B checkpoint overhead on NFS-backed cluster."""
bytes_per_param = 2 # BF16
weights_gb = model_params_b * bytes_per_param # 140 GB
optimizer_gb = weights_gb * 2 # Adam first + second moments
total_ckpt_gb = weights_gb + optimizer_gb # 420 GB
# Storage constraints # ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
nfs_gbps = 10 # Gbps network model_params_b = 70 # 70B parameter model
nfs_gbs = nfs_gbps / 8 # 1.25 GB/s bytes_per_param = 2 # BF16 weights
min_write_s = total_ckpt_gb / nfs_gbs # seconds nfs_gbps = 10 # NFS network attachment bandwidth in Gbps
min_write_min = min_write_s / 60 # minutes n_nodes = 64 # nodes writing simultaneously
overhead_pct = 30 # observed training throughput loss %
base_weeks = 2 # baseline training duration (weeks)
extra_cost_k = 500 # additional cost from extended training ($K)
# Contention analysis # ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
n_nodes = 64 # Model state sizing
per_node_gbs = nfs_gbs / n_nodes # GB/s per node weights_gb = model_params_b * bytes_per_param # 140 GB
per_node_mbs = per_node_gbs * 1000 # MB/s per node optimizer_gb = weights_gb * 2 # Adam m + v moments
serialized_min = (total_ckpt_gb / per_node_gbs) / 60 total_ckpt_gb = weights_gb + optimizer_gb # 420 GB
# Training extension # Storage bandwidth limits
overhead_pct = 30 nfs_gbs = nfs_gbps / 8 # 1.25 GB/s
base_weeks = 2 min_write_s = total_ckpt_gb / nfs_gbs # theoretical minimum seconds
extended_weeks = base_weeks * (1 + overhead_pct / 100) min_write_min = min_write_s / 60 # convert to minutes
extra_cost_k = 500 # $K
# Format strings # Contention: 64 nodes sharing the NFS bandwidth
weights_gb_str = f"{weights_gb:.0f}" per_node_gbs = nfs_gbs / n_nodes # GB/s per node under contention
optimizer_gb_str = f"{optimizer_gb:.0f}" per_node_mbs = per_node_gbs * 1000 # MB/s per node
total_ckpt_gb_str = f"{total_ckpt_gb:.0f}" serialized_min = (total_ckpt_gb / per_node_gbs) / 60 # worst-case serialized write time
nfs_gbs_str = f"{nfs_gbs}"
min_write_s_str = f"{min_write_s:.0f}" # Training schedule impact
min_write_min_str = f"{min_write_min:.1f}" extended_weeks = base_weeks * (1 + overhead_pct / 100)
per_node_mbs_str = f"{per_node_mbs:.0f}"
serialized_min_str = f"{serialized_min:.0f}" # ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
extended_weeks_str = f"{extended_weeks:.1f}" assert min_write_min < 10, "Theoretical minimum must be less than observed 10 minutes"
extra_cost_k_str = f"{extra_cost_k}" assert serialized_min > min_write_min, "Contention time must exceed theoretical minimum"
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
weights_gb_str = f"{weights_gb:.0f}"
optimizer_gb_str = f"{optimizer_gb:.0f}"
total_ckpt_gb_str = f"{total_ckpt_gb:.0f}"
nfs_gbs_str = f"{nfs_gbs}"
min_write_s_str = f"{min_write_s:.0f}"
min_write_min_str = f"{min_write_min:.1f}"
per_node_mbs_str = f"{per_node_mbs:.0f}"
serialized_min_str = f"{serialized_min:.0f}"
extended_weeks_str = f"{extended_weeks:.1f}"
extra_cost_k_str = f"{extra_cost_k}"
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
weights_gb_str = CheckpointDebugCalc.weights_gb_str
optimizer_gb_str = CheckpointDebugCalc.optimizer_gb_str
total_ckpt_gb_str = CheckpointDebugCalc.total_ckpt_gb_str
nfs_gbs_str = CheckpointDebugCalc.nfs_gbs_str
min_write_s_str = CheckpointDebugCalc.min_write_s_str
min_write_min_str = CheckpointDebugCalc.min_write_min_str
per_node_gbs = CheckpointDebugCalc.per_node_gbs
per_node_mbs_str = CheckpointDebugCalc.per_node_mbs_str
serialized_min_str = CheckpointDebugCalc.serialized_min_str
extended_weeks_str = CheckpointDebugCalc.extended_weeks_str
extra_cost_k_str = CheckpointDebugCalc.extra_cost_k_str
``` ```
::: {.callout-example title="Debugging Checkpoint Overhead"} ::: {.callout-example title="Debugging Checkpoint Overhead"}