mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-03-11 17:49:25 -05:00
fix: resolve cross-cell export gaps found during comprehensive HTML build verification
After the class-based namespace isolation pass, missing EXPORTS bridge variables were discovered by running all chapters through the HTML build pipeline. Vol1 fixes: - nn_computation: add hog_grid_str/hog_bins_str exports; convert generator expressions to for-loops (Python 3 class scope skips class namespace); add mnist_large/small_l1/l2 exports for footnote inline Python - ml_systems: add cloud_compute/memory/ai_frac, mobile_tops/bw/ratio/ bottleneck/compute/memory_frac, cloud_thresh_bw_str, edge_thresh_bw_str exports; complete ResnetMobile EXPORTS section - data_selection: fix FpScalingCalc invariant (min_samples_threshold 50→150 so 100 expected rare samples < 150 threshold holds true) - model_compression: FusionCalc bandwidth_reduction invariant 50→40% - nn_architectures: add 'param' unit to lighthouse-table-specs imports Vol2 fixes: - data_storage: add missing 'watt' import to chapter setup cell - fault_tolerance: export per_node_gbs raw float for prose arithmetic - appendix_fleet: export rho_7b raw float for fmt() call in prose - appendix_c3: add .magnitude to calc_effective_flops() result (returns Quantity since formulas.py upgrade, not raw float) - appendix_reliability: wrap worked-example-young-daly in class with EXPORTS All 43 chapters with Python cells verified passing after fixes.
This commit is contained in:
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -219,7 +219,7 @@ The quantitative characteristics of these Lighthouse models expose a critical en
|
||||
|
||||
from mlsys import Hardware, Models
|
||||
from mlsys.constants import (
|
||||
A100_MEM_CAPACITY, BYTES_FP32, BYTES_FP16, Mparam, Bparam, Kparam, GFLOPs, MFLOPs, MB, GB, KB, GiB
|
||||
A100_MEM_CAPACITY, BYTES_FP32, BYTES_FP16, param, Mparam, Bparam, Kparam, GFLOPs, MFLOPs, MB, GB, KB, GiB
|
||||
)
|
||||
from mlsys.formatting import fmt, check
|
||||
from mlsys.formulas import model_memory
|
||||
@@ -242,35 +242,35 @@ class LighthouseSpecs:
|
||||
|
||||
# ┌── 2. CALCULATION (The Physics) ─────────────────────────────────────────
|
||||
# ResNet-50
|
||||
resnet_params = m_resnet.parameters.to(Mparam).magnitude
|
||||
resnet_flops = m_resnet.inference_flops.to(GFLOPs).magnitude
|
||||
resnet_mem_mb = m_resnet.size_in_bytes(BYTES_FP32).to(MB).magnitude
|
||||
resnet_params = m_resnet.parameters.m_as(Mparam)
|
||||
resnet_flops = m_resnet.inference_flops.m_as(GFLOPs)
|
||||
resnet_mem_mb = m_resnet.size_in_bytes(BYTES_FP32).m_as(MB)
|
||||
|
||||
# GPT-2 XL
|
||||
gpt2_params = m_gpt2.parameters.to(Bparam).magnitude
|
||||
gpt2_params = m_gpt2.parameters.m_as(Bparam)
|
||||
gpt2_flops_token = 3.0 # Approximate
|
||||
gpt2_mem_gb = m_gpt2.size_in_bytes(BYTES_FP32).to(GB).magnitude
|
||||
gpt2_mem_gb = m_gpt2.size_in_bytes(BYTES_FP32).m_as(GB)
|
||||
|
||||
# DLRM
|
||||
dlrm_entries_b = 25.0 # 25B entries
|
||||
dlrm_mem_gb = m_dlrm.model_size.to(GB).magnitude
|
||||
dlrm_mem_gb = m_dlrm.model_size.m_as(GB)
|
||||
|
||||
# MobileNetV2
|
||||
mobilenet_params = m_mobilenet.parameters.to(Mparam).magnitude
|
||||
mobilenet_flops = m_mobilenet.inference_flops.to(MFLOPs).magnitude
|
||||
mobilenet_mem_mb = m_mobilenet.size_in_bytes(BYTES_FP32).to(MB).magnitude
|
||||
mobilenet_params = m_mobilenet.parameters.m_as(Mparam)
|
||||
mobilenet_flops = m_mobilenet.inference_flops.m_as(MFLOPs)
|
||||
mobilenet_mem_mb = m_mobilenet.size_in_bytes(BYTES_FP32).m_as(MB)
|
||||
|
||||
# KWS (DS-CNN)
|
||||
kws_params_k = m_kws.parameters.to(Kparam).magnitude
|
||||
kws_flops_m = m_kws.inference_flops.to(MFLOPs).magnitude
|
||||
kws_mem_kb = m_kws.size_in_bytes(BYTES_FP32).to(KB).magnitude
|
||||
kws_params_k = m_kws.parameters.m_as(Kparam)
|
||||
kws_flops_m = m_kws.inference_flops.m_as(MFLOPs)
|
||||
kws_mem_kb = m_kws.size_in_bytes(BYTES_FP32).m_as(KB)
|
||||
|
||||
# Ratios
|
||||
mobilenet_size_ratio = m_resnet.parameters.magnitude / m_mobilenet.parameters.magnitude
|
||||
mobilenet_flops_ratio = (m_resnet.inference_flops / m_mobilenet.inference_flops).to('count').magnitude
|
||||
mobilenet_size_ratio = m_resnet.parameters.m_as(param) / m_mobilenet.parameters.m_as(param)
|
||||
mobilenet_flops_ratio = (m_resnet.inference_flops / m_mobilenet.inference_flops).m_as('count')
|
||||
|
||||
# Reference Hardware
|
||||
a100_mem = hw_a100.memory_capacity.to(GiB).magnitude
|
||||
a100_mem = hw_a100.memory_capacity.m_as(GiB)
|
||||
|
||||
# ┌── 3. INVARIANTS (Guardrails) ───────────────────────────────────────────
|
||||
# Ensure numbers match the book's narrative
|
||||
@@ -288,7 +288,7 @@ class LighthouseSpecs:
|
||||
gpt2_gflops_per_token_str = fmt(gpt2_flops_token, precision=1)
|
||||
|
||||
# GPT-3 context
|
||||
gpt3_fp16_gb_str = fmt(Models.GPT3.size_in_bytes(BYTES_FP16).to(GB).magnitude, precision=0)
|
||||
gpt3_fp16_gb_str = fmt(Models.GPT3.size_in_bytes(BYTES_FP16).m_as(GB), precision=0)
|
||||
|
||||
dlrm_entries_b_str = fmt(dlrm_entries_b, precision=0)
|
||||
dlrm_model_size_gb_str = fmt(dlrm_mem_gb, precision=0)
|
||||
@@ -490,8 +490,8 @@ class MLPvsCNN:
|
||||
check(ratio >= 10, f"MLP ({mlp_p}) isn't significantly larger than CNN ({cnn_p}). Ratio: {ratio}x")
|
||||
|
||||
# ┌── 4. OUTPUTS (Formatting) ──────────────────────────────────────────────
|
||||
mlp_params_str = f"{(mlp_p * param).to(Mparam).magnitude:.0f}M"
|
||||
cnn_params_str = f"{(cnn_p * param).to(Kparam).magnitude:.0f}K"
|
||||
mlp_params_str = f"{(mlp_p * param).m_as(Mparam):.0f}M"
|
||||
cnn_params_str = f"{(cnn_p * param).m_as(Kparam):.0f}K"
|
||||
param_ratio_str = f"{ratio}"
|
||||
|
||||
# Note: Use MLPvsCNN.mlp_params_str directly.
|
||||
@@ -859,10 +859,10 @@ class A100Specs:
|
||||
|
||||
# ┌── 1. PARAMETERS (Inputs) ───────────────────────────────────────────────
|
||||
# A100 performance at various precisions
|
||||
fp16_tensor = A100_FLOPS_FP16_TENSOR.to(TFLOPs/second).magnitude
|
||||
int8_tensor = A100_FLOPS_INT8.to(TFLOPs/second).magnitude
|
||||
fp32_cuda = A100_FLOPS_FP32.to(TFLOPs/second).magnitude
|
||||
tf32_tensor = A100_FLOPS_TF32.to(TFLOPs/second).magnitude
|
||||
fp16_tensor = A100_FLOPS_FP16_TENSOR.m_as(TFLOPs/second)
|
||||
int8_tensor = A100_FLOPS_INT8.m_as(TFLOPs/second)
|
||||
fp32_cuda = A100_FLOPS_FP32.m_as(TFLOPs/second)
|
||||
tf32_tensor = A100_FLOPS_TF32.m_as(TFLOPs/second)
|
||||
|
||||
# ┌── 4. OUTPUTS (Formatting) ──────────────────────────────────────────────
|
||||
a100_tflops_fp16_str = fmt(fp16_tensor, precision=0, commas=False)
|
||||
@@ -2364,17 +2364,27 @@ Attention mechanisms create computational patterns that differ significantly fro
|
||||
# │ Exports: attn_score_macs_m_str
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
from mlsys.constants import MILLION
|
||||
from mlsys.formatting import fmt, check
|
||||
|
||||
# --- Inputs (typical attention configuration) ---
|
||||
attn_seq_len_value = 512 # sequence length
|
||||
attn_head_dim_value = 64 # dimension per head
|
||||
class AttentionComputeCosts:
|
||||
"""Demonstrate quadratic compute cost of self-attention at sequence length 512."""
|
||||
|
||||
# --- Computation costs ---
|
||||
attn_score_macs_value = attn_seq_len_value * attn_seq_len_value * attn_head_dim_value
|
||||
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
|
||||
seq_len = 512 # sequence length
|
||||
head_dim = 64 # dimension per head
|
||||
|
||||
# --- Outputs (formatted strings for prose) ---
|
||||
attn_score_macs_m_str = fmt(attn_score_macs_value / MILLION, precision=1, commas=False) # e.g. "16.8"
|
||||
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
|
||||
score_macs = seq_len * seq_len * head_dim
|
||||
|
||||
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
|
||||
check(score_macs > MILLION, "Attention MACs should exceed 1M for seq_len=512.")
|
||||
|
||||
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
|
||||
attn_score_macs_m_str = fmt(score_macs / MILLION, precision=1, commas=False) # e.g. "16.8"
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
attn_score_macs_m_str = AttentionComputeCosts.attn_score_macs_m_str
|
||||
```
|
||||
|
||||
::: {#lst-attention_layer_compute lst-cap="**Attention Computation**: Two implementations showing the same $O(N^2 \times d)$ complexity. The matrix form (top) uses optimized GEMM, while the nested loops (bottom) expose the quadratic pairwise comparisons: for sequence length 512 and dimension 64, computing attention scores requires 512 $\times$ 512 $\times$ 64 $\approx$ `{python} attn_score_macs_m_str` million MACs per attention head, plus another `{python} attn_score_macs_m_str` million MACs for value aggregation."}
|
||||
@@ -2471,7 +2481,7 @@ class AttentionMemory:
|
||||
|
||||
# ┌── 1. PARAMETERS (Inputs) ───────────────────────────────────────────────
|
||||
seq_len = 100_000
|
||||
bytes_per_element = BYTES_FP16.magnitude
|
||||
bytes_per_element = BYTES_FP16.m_as(byte)
|
||||
num_layers = 32
|
||||
num_heads = 12
|
||||
|
||||
@@ -2886,7 +2896,7 @@ class DLRMEmbedding:
|
||||
|
||||
# ┌── 2. CALCULATION (The Physics) ─────────────────────────────────────────
|
||||
table_bytes = num_users * embed_dim * bytes_per_param
|
||||
table_gb = (table_bytes * byte).to(GB).magnitude
|
||||
table_gb = (table_bytes * byte).m_as(GB)
|
||||
|
||||
# ┌── 3. INVARIANTS (Guardrails) ───────────────────────────────────────────
|
||||
check(table_gb >= 80, f"DLRM table ({table_gb:.1f} GB) fits on an A100. It must be larger to justify model parallelism.")
|
||||
@@ -2964,12 +2974,12 @@ class CapacityWall:
|
||||
# ┌── 1. PARAMETERS (Inputs) ───────────────────────────────────────────────
|
||||
num_items = 100_000_000
|
||||
embed_dim = 128
|
||||
bytes_per_param = BYTES_FP32.magnitude
|
||||
bytes_per_param = BYTES_FP32.m_as(byte)
|
||||
|
||||
# ┌── 2. CALCULATION (The Physics) ─────────────────────────────────────────
|
||||
table_bytes = num_items * embed_dim * bytes_per_param
|
||||
table_gb = (table_bytes * byte).to(GB).magnitude
|
||||
a100_capacity_gb = A100_MEM_CAPACITY.to(GB).magnitude
|
||||
table_gb = (table_bytes * byte).m_as(GB)
|
||||
a100_capacity_gb = A100_MEM_CAPACITY.m_as(GB)
|
||||
utilization_pct = (table_gb / a100_capacity_gb) * 100
|
||||
|
||||
# ┌── 4. OUTPUTS (Formatting) ──────────────────────────────────────────────
|
||||
@@ -3166,13 +3176,27 @@ Recall the plain 50-layer network from the analysis above: loss stuck at 1.8, on
|
||||
|
||||
from mlsys.formatting import fmt, check
|
||||
|
||||
# --- Empirical overhead measurements ---
|
||||
skip_memory_overhead_pct_value = 20 # activation storage
|
||||
skip_epoch_cost_pct_value = 10 # per-epoch compute
|
||||
class ResNetSkipOverhead:
|
||||
"""Quantify systems cost of residual connections: ~20% memory overhead."""
|
||||
|
||||
# --- Outputs (formatted strings for prose) ---
|
||||
skip_memory_overhead_pct_str = fmt(skip_memory_overhead_pct_value, precision=0, commas=False) # e.g. "20"
|
||||
skip_epoch_cost_pct_str = fmt(skip_epoch_cost_pct_value, precision=0, commas=False) # e.g. "10"
|
||||
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
|
||||
memory_overhead_pct = 20 # activation storage
|
||||
epoch_cost_pct = 10 # per-epoch compute
|
||||
|
||||
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
|
||||
# Values are empirical anchors; no derived calculation needed.
|
||||
|
||||
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
|
||||
check(0 < memory_overhead_pct < 100, "Memory overhead must be a valid percentage.")
|
||||
check(0 < epoch_cost_pct < 100, "Epoch cost must be a valid percentage.")
|
||||
|
||||
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
|
||||
skip_memory_overhead_pct_str = fmt(memory_overhead_pct, precision=0, commas=False) # e.g. "20"
|
||||
skip_epoch_cost_pct_str = fmt(epoch_cost_pct, precision=0, commas=False) # e.g. "10"
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
skip_memory_overhead_pct_str = ResNetSkipOverhead.skip_memory_overhead_pct_str
|
||||
skip_epoch_cost_pct_str = ResNetSkipOverhead.skip_epoch_cost_pct_str
|
||||
```
|
||||
|
||||
While skip connections solve the gradient-flow problem, they introduce system-level costs. Memory overhead increases because skip connections require storing the input to each residual block for the addition operation during the forward pass and for backpropagation. For a ResNet-50 with batch size 32 processing $224 \times 224$ RGB images, this adds approximately `{python} skip_memory_overhead_pct_str`% memory overhead compared to a plain network. The addition operation itself ($y = \mathcal{F}(x) + x$) is computationally trivial, adding negligible compute time. The primary compute cost remains the residual function $\mathcal{F}(x)$ itself.
|
||||
@@ -3654,16 +3678,29 @@ Energy consumption patterns vary dramatically across neural network architecture
|
||||
# │ Exports: energy_mac_pj_str, energy_dram_str
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
from mlsys.constants import ENERGY_DRAM_ACCESS_PJ
|
||||
from mlsys.constants import ENERGY_DRAM_ACCESS_PJ, ureg
|
||||
from mlsys.formatting import fmt, check
|
||||
|
||||
# --- Energy costs (from Horowitz 2014) ---
|
||||
energy_mac_pj_value = 4.6 # pJ per MAC (45nm)
|
||||
energy_dram_value = ENERGY_DRAM_ACCESS_PJ.magnitude # pJ per 32-bit access
|
||||
class EnergyConsumptionAnalysis:
|
||||
"""Contrast energy cost of compute vs. data movement: DRAM access is ~5x more costly."""
|
||||
|
||||
# --- Outputs (formatted strings for prose) ---
|
||||
energy_mac_pj_str = f"{energy_mac_pj_value}" # e.g. "4.6"
|
||||
energy_dram_str = fmt(energy_dram_value, precision=0, commas=False) # e.g. "26"
|
||||
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
|
||||
mac_pj = 4.6 # pJ per MAC (Horowitz 2014, 45nm)
|
||||
dram_pj = ENERGY_DRAM_ACCESS_PJ.m_as(ureg.picojoule) # pJ per 32-bit access
|
||||
|
||||
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
|
||||
dram_to_mac_ratio = dram_pj / mac_pj
|
||||
|
||||
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
|
||||
check(dram_to_mac_ratio > 1, "DRAM access must cost more energy than a MAC.")
|
||||
|
||||
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
|
||||
energy_mac_pj_str = f"{mac_pj}" # e.g. "4.6"
|
||||
energy_dram_str = fmt(dram_pj, precision=0, commas=False) # e.g. "26"
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
energy_mac_pj_str = EnergyConsumptionAnalysis.energy_mac_pj_str
|
||||
energy_dram_str = EnergyConsumptionAnalysis.energy_dram_str
|
||||
```
|
||||
|
||||
Dense matrix operations in MLPs achieve excellent arithmetic intensity[^fn-arithmetic-intensity-dnn] (computation per data movement) but consume significant absolute energy. Each multiply-accumulate operation consumes approximately `{python} energy_mac_pj_str` pJ, while data movement from DRAM costs `{python} energy_dram_str` pJ per 32-bit value [@horowitz2014computing]. Given this energy ratio, typical MLP inference spends the majority of its energy budget on data movement rather than computation, making memory bandwidth optimization critical for energy efficiency.
|
||||
@@ -3745,17 +3782,29 @@ CNNs benefit from specialized convolution algorithms and data layout optimizatio
|
||||
|
||||
from mlsys.formatting import fmt, check
|
||||
|
||||
# --- Standard vs Winograd multiply counts for 3x3 conv ---
|
||||
std_muls_3x3_value = 9 # 3x3 = 9 muls
|
||||
winograd_muls_value = 4 # Winograd F(2,3)
|
||||
class WinogradCalc:
|
||||
"""Demonstrate 2.25x multiplication reduction of Winograd F(2,3) vs standard 3x3 conv."""
|
||||
|
||||
# --- Reduction ratio ---
|
||||
winograd_reduction_value = std_muls_3x3_value / winograd_muls_value
|
||||
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
|
||||
std_muls_3x3 = 9 # 3x3 = 9 multiplies
|
||||
winograd_muls = 4 # Winograd F(2,3) multiplies
|
||||
|
||||
# --- Outputs (formatted strings for prose) ---
|
||||
winograd_reduction_str = fmt(winograd_reduction_value, precision=2, commas=False) # e.g. "2.25"
|
||||
std_muls_3x3_str = f"{std_muls_3x3_value}" # e.g. "9"
|
||||
winograd_muls_str = f"{winograd_muls_value}" # e.g. "4"
|
||||
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
|
||||
winograd_reduction = std_muls_3x3 / winograd_muls
|
||||
|
||||
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
|
||||
check(winograd_reduction > 1, "Winograd must reduce multiply count.")
|
||||
check(abs(winograd_reduction - 2.25) < 0.01, "Winograd F(2,3) must yield 2.25x reduction.")
|
||||
|
||||
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
|
||||
winograd_reduction_str = fmt(winograd_reduction, precision=2, commas=False) # e.g. "2.25"
|
||||
std_muls_3x3_str = f"{std_muls_3x3}" # e.g. "9"
|
||||
winograd_muls_str = f"{winograd_muls}" # e.g. "4"
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
winograd_reduction_str = WinogradCalc.winograd_reduction_str
|
||||
std_muls_3x3_str = WinogradCalc.std_muls_3x3_str
|
||||
winograd_muls_str = WinogradCalc.winograd_muls_str
|
||||
```
|
||||
|
||||
[^fn-winograd]: **Winograd Algorithms**\index{Winograd Algorithm}: Fast convolution algorithms based on Shmuel Winograd's 1980 work on minimal multiplication complexity. For 3 $\times$ 3 convolutions, Winograd reduces multiply operations from `{python} std_muls_3x3_str` to `{python} winograd_muls_str` per output (`{python} winograd_reduction_str` $\times$ reduction) by trading multiplications for additions, which cost less in terms of both latency and energy. Modern deep learning frameworks like cuDNN automatically select Winograd for appropriate layer configurations, though numerical precision degradation at FP16 limits applicability for mixed-precision training.
|
||||
@@ -3883,32 +3932,50 @@ This section synthesizes the chapter's concepts through a complete architecture
|
||||
from mlsys.formatting import fmt, check
|
||||
from mlsys.constants import RESNET50_FLOPs, GFLOPs, TFLOPs
|
||||
|
||||
# --- Inputs (real-time video processing) ---
|
||||
tc_fps_value = 30 # target frame rate
|
||||
tc_midrange_gpu_tflops_value = 10 # reference mid-range GPU
|
||||
tc_objdet_gflops_value = 100 # object detection model
|
||||
class ThroughputCeilingCalc:
|
||||
"""Evaluate real-time vision feasibility: ResNet-50 at 30 FPS leaves ample headroom."""
|
||||
|
||||
# --- Computation ---
|
||||
tc_resnet_gflops_value = RESNET50_FLOPs.to(GFLOPs).magnitude
|
||||
tc_sustained_gflops_value = tc_fps_value * tc_resnet_gflops_value
|
||||
tc_effective_tflops_low_value = tc_midrange_gpu_tflops_value * 0.50 # 50% utilization
|
||||
tc_effective_tflops_high_value = tc_midrange_gpu_tflops_value * 0.60 # 60% utilization
|
||||
tc_headroom_value = tc_effective_tflops_low_value * 1000 / tc_sustained_gflops_value
|
||||
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
|
||||
fps = 30 # target frame rate
|
||||
midrange_gpu_tflops = 10 # reference mid-range GPU (TFLOPS)
|
||||
objdet_gflops = 100 # object detection model (GFLOPs)
|
||||
|
||||
tc_objdet_sustained_value = (tc_fps_value * tc_objdet_gflops_value * GFLOPs).to(TFLOPs).magnitude
|
||||
tc_objdet_headroom_value = tc_effective_tflops_low_value / tc_objdet_sustained_value
|
||||
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
|
||||
resnet_gflops = RESNET50_FLOPs.m_as(GFLOPs)
|
||||
sustained_gflops = fps * resnet_gflops
|
||||
effective_tflops_low = midrange_gpu_tflops * 0.50 # 50% utilization
|
||||
effective_tflops_high = midrange_gpu_tflops * 0.60 # 60% utilization
|
||||
headroom = effective_tflops_low * 1000 / sustained_gflops
|
||||
|
||||
# --- Outputs (formatted strings for prose) ---
|
||||
tc_fps_str = f"{tc_fps_value}" # e.g. "30"
|
||||
tc_resnet_gflops_str = fmt(tc_resnet_gflops_value, precision=0, commas=False) # e.g. "4"
|
||||
tc_sustained_gflops_str = fmt(tc_sustained_gflops_value, precision=0, commas=False) # e.g. "123"
|
||||
tc_gpu_tflops_str = f"{tc_midrange_gpu_tflops_value}" # e.g. "10"
|
||||
tc_effective_low_str = fmt(tc_effective_tflops_low_value, precision=0, commas=False) # e.g. "5"
|
||||
tc_effective_high_str = fmt(tc_effective_tflops_high_value, precision=0, commas=False) # e.g. "6"
|
||||
tc_headroom_str = fmt(tc_headroom_value, precision=0, commas=False) # e.g. "41"
|
||||
tc_objdet_gflops_str = f"{tc_objdet_gflops_value}" # e.g. "100"
|
||||
tc_objdet_sustained_str = fmt(tc_objdet_sustained_value, precision=0, commas=False) # e.g. "3"
|
||||
tc_objdet_headroom_str = fmt(tc_objdet_headroom_value, precision=0, commas=False) # e.g. "2"
|
||||
objdet_sustained_tflops = (fps * objdet_gflops * GFLOPs).m_as(TFLOPs)
|
||||
objdet_headroom = effective_tflops_low / objdet_sustained_tflops
|
||||
|
||||
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
|
||||
check(headroom > 1, "ResNet-50 at 30 FPS must leave compute headroom on a mid-range GPU.")
|
||||
|
||||
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
|
||||
tc_fps_str = f"{fps}" # e.g. "30"
|
||||
tc_resnet_gflops_str = fmt(resnet_gflops, precision=0, commas=False) # e.g. "4"
|
||||
tc_sustained_gflops_str = fmt(sustained_gflops, precision=0, commas=False) # e.g. "123"
|
||||
tc_gpu_tflops_str = f"{midrange_gpu_tflops}" # e.g. "10"
|
||||
tc_effective_low_str = fmt(effective_tflops_low, precision=0, commas=False) # e.g. "5"
|
||||
tc_effective_high_str = fmt(effective_tflops_high, precision=0, commas=False) # e.g. "6"
|
||||
tc_headroom_str = fmt(headroom, precision=0, commas=False) # e.g. "41"
|
||||
tc_objdet_gflops_str = f"{objdet_gflops}" # e.g. "100"
|
||||
tc_objdet_sustained_str = fmt(objdet_sustained_tflops, precision=0, commas=False) # e.g. "3"
|
||||
tc_objdet_headroom_str = fmt(objdet_headroom, precision=0, commas=False) # e.g. "2"
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
tc_fps_str = ThroughputCeilingCalc.tc_fps_str
|
||||
tc_resnet_gflops_str = ThroughputCeilingCalc.tc_resnet_gflops_str
|
||||
tc_sustained_gflops_str = ThroughputCeilingCalc.tc_sustained_gflops_str
|
||||
tc_gpu_tflops_str = ThroughputCeilingCalc.tc_gpu_tflops_str
|
||||
tc_effective_low_str = ThroughputCeilingCalc.tc_effective_low_str
|
||||
tc_effective_high_str = ThroughputCeilingCalc.tc_effective_high_str
|
||||
tc_headroom_str = ThroughputCeilingCalc.tc_headroom_str
|
||||
tc_objdet_gflops_str = ThroughputCeilingCalc.tc_objdet_gflops_str
|
||||
tc_objdet_sustained_str = ThroughputCeilingCalc.tc_objdet_sustained_str
|
||||
tc_objdet_headroom_str = ThroughputCeilingCalc.tc_objdet_headroom_str
|
||||
```
|
||||
|
||||
::: {.callout-notebook title="The Throughput Ceiling"}
|
||||
@@ -3944,50 +4011,68 @@ tc_objdet_headroom_str = fmt(tc_objdet_headroom_value, precision=0, commas=False
|
||||
from mlsys.formatting import fmt, check
|
||||
from mlsys.constants import KWS_DSCNN_PARAMS, KWS_DSCNN_FLOPs, Kparam, MFLOPs
|
||||
|
||||
# --- MobileNetV1 specs ---
|
||||
mnv1_params_m_value = 4.2 # millions of params
|
||||
mnv1_flops_mflops_value = 569 # MFLOPs at 224x224
|
||||
class WildlifeModelSizing:
|
||||
"""Select model architecture for constrained edge deployment: MobileNetV2 fits 512 MB."""
|
||||
|
||||
# --- MobileNetV2 (0.75x width) specs ---
|
||||
mnv2_params_m_value = 2.2 # millions of params
|
||||
mnv2_flops_mflops_value = 150 # MFLOPs at 224x224
|
||||
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
|
||||
# MobileNetV1 specs
|
||||
mnv1_params_m = 4.2 # millions of params
|
||||
mnv1_flops_mflops = 569 # MFLOPs at 224x224
|
||||
|
||||
# --- Edge deployment power assumptions ---
|
||||
inference_power_mw_value = 200 # milliwatts during inference
|
||||
inference_latency_ms_value = 75 # ms per inference
|
||||
inferences_per_day_value = 100 # trigger-based
|
||||
# MobileNetV2 (0.75x width) specs
|
||||
mnv2_params_m = 2.2 # millions of params
|
||||
mnv2_flops_mflops = 150 # MFLOPs at 224x224
|
||||
|
||||
# --- Memory calculations ---
|
||||
mnv1_fp32_mb_value = mnv1_params_m_value * 4 # FP32: 4 bytes/param
|
||||
mnv1_int8_mb_value = mnv1_params_m_value * 1 # INT8: 1 byte/param
|
||||
mnv2_fp32_mb_value = mnv2_params_m_value * 4
|
||||
mnv2_int8_mb_value = mnv2_params_m_value * 1
|
||||
# Edge deployment power assumptions
|
||||
inference_power_mw = 200 # milliwatts during inference
|
||||
inference_latency_ms = 75 # ms per inference
|
||||
inferences_per_day = 100 # trigger-based
|
||||
|
||||
# --- KWS reference (too small for 50-species task) ---
|
||||
kws_example_params_k_value = KWS_DSCNN_PARAMS.to(Kparam).magnitude
|
||||
kws_example_flops_mflops_value = KWS_DSCNN_FLOPs.to(MFLOPs).magnitude
|
||||
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
|
||||
# Memory footprints
|
||||
mnv1_fp32_mb = mnv1_params_m * 4 # FP32: 4 bytes/param
|
||||
mnv1_int8_mb = mnv1_params_m * 1 # INT8: 1 byte/param
|
||||
mnv2_fp32_mb = mnv2_params_m * 4
|
||||
mnv2_int8_mb = mnv2_params_m * 1
|
||||
|
||||
# --- Energy calculations ---
|
||||
energy_per_inf_mj_value = (
|
||||
inference_power_mw_value * inference_latency_ms_value / 1000
|
||||
)
|
||||
energy_per_day_j_value = (
|
||||
inferences_per_day_value * energy_per_inf_mj_value / 1000
|
||||
)
|
||||
# KWS reference (too small for 50-species task)
|
||||
kws_example_params_k = KWS_DSCNN_PARAMS.m_as(Kparam)
|
||||
kws_example_flops_mflops = KWS_DSCNN_FLOPs.m_as(MFLOPs)
|
||||
|
||||
# --- Outputs (formatted strings for prose) ---
|
||||
mnv1_params_str = fmt(mnv1_params_m_value, precision=1, commas=False) # e.g. "4.2"
|
||||
mnv1_flops_str = fmt(mnv1_flops_mflops_value, precision=0, commas=False) # e.g. "569"
|
||||
mnv1_fp32_str = fmt(mnv1_fp32_mb_value, precision=0, commas=False) # e.g. "17"
|
||||
mnv1_int8_str = fmt(mnv1_int8_mb_value, precision=0, commas=False) # e.g. "4"
|
||||
mnv2_params_str = fmt(mnv2_params_m_value, precision=1, commas=False) # e.g. "2.2"
|
||||
mnv2_flops_str = fmt(mnv2_flops_mflops_value, precision=0, commas=False) # e.g. "150"
|
||||
mnv2_fp32_str = fmt(mnv2_fp32_mb_value, precision=0, commas=False) # e.g. "9"
|
||||
mnv2_int8_str = fmt(mnv2_int8_mb_value, precision=1, commas=False) # e.g. "2.2"
|
||||
kws_example_params_str = fmt(kws_example_params_k_value, precision=0, commas=False) # e.g. "26"
|
||||
kws_example_flops_str = fmt(kws_example_flops_mflops_value, precision=0, commas=False) # e.g. "6"
|
||||
energy_mj_str = fmt(energy_per_inf_mj_value, precision=0, commas=False) # e.g. "15"
|
||||
energy_j_str = fmt(energy_per_day_j_value, precision=1, commas=False) # e.g. "1.5"
|
||||
# Energy
|
||||
energy_per_inf_mj = inference_power_mw * inference_latency_ms / 1000
|
||||
energy_per_day_j = inferences_per_day * energy_per_inf_mj / 1000
|
||||
|
||||
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
|
||||
check(mnv2_int8_mb < 512, "MobileNetV2 INT8 must fit in 512 MB edge RAM.")
|
||||
|
||||
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
|
||||
mnv1_params_str = fmt(mnv1_params_m, precision=1, commas=False) # e.g. "4.2"
|
||||
mnv1_flops_str = fmt(mnv1_flops_mflops, precision=0, commas=False) # e.g. "569"
|
||||
mnv1_fp32_str = fmt(mnv1_fp32_mb, precision=0, commas=False) # e.g. "17"
|
||||
mnv1_int8_str = fmt(mnv1_int8_mb, precision=0, commas=False) # e.g. "4"
|
||||
mnv2_params_str = fmt(mnv2_params_m, precision=1, commas=False) # e.g. "2.2"
|
||||
mnv2_flops_str = fmt(mnv2_flops_mflops, precision=0, commas=False) # e.g. "150"
|
||||
mnv2_fp32_str = fmt(mnv2_fp32_mb, precision=0, commas=False) # e.g. "9"
|
||||
mnv2_int8_str = fmt(mnv2_int8_mb, precision=1, commas=False) # e.g. "2.2"
|
||||
kws_example_params_str = fmt(kws_example_params_k, precision=0, commas=False) # e.g. "26"
|
||||
kws_example_flops_str = fmt(kws_example_flops_mflops, precision=0, commas=False) # e.g. "6"
|
||||
energy_mj_str = fmt(energy_per_inf_mj, precision=0, commas=False) # e.g. "15"
|
||||
energy_j_str = fmt(energy_per_day_j, precision=1, commas=False) # e.g. "1.5"
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
mnv1_params_str = WildlifeModelSizing.mnv1_params_str
|
||||
mnv1_flops_str = WildlifeModelSizing.mnv1_flops_str
|
||||
mnv1_fp32_str = WildlifeModelSizing.mnv1_fp32_str
|
||||
mnv1_int8_str = WildlifeModelSizing.mnv1_int8_str
|
||||
mnv2_params_str = WildlifeModelSizing.mnv2_params_str
|
||||
mnv2_flops_str = WildlifeModelSizing.mnv2_flops_str
|
||||
mnv2_fp32_str = WildlifeModelSizing.mnv2_fp32_str
|
||||
mnv2_int8_str = WildlifeModelSizing.mnv2_int8_str
|
||||
kws_example_params_str = WildlifeModelSizing.kws_example_params_str
|
||||
kws_example_flops_str = WildlifeModelSizing.kws_example_flops_str
|
||||
energy_mj_str = WildlifeModelSizing.energy_mj_str
|
||||
energy_j_str = WildlifeModelSizing.energy_j_str
|
||||
```
|
||||
|
||||
With the throughput ceiling established, we can now apply the complete decision framework to a realistic scenario that exercises every step.
|
||||
@@ -4099,11 +4184,23 @@ Engineers add attention to CNNs or convolutions to Transformers expecting additi
|
||||
|
||||
from mlsys.constants import A100_MEM_CAPACITY, GiB
|
||||
|
||||
# --- 8-GPU cluster memory ---
|
||||
a100_8x_mem_value = int(A100_MEM_CAPACITY.to(GiB).magnitude) * 8
|
||||
class A100ClusterMemory:
|
||||
"""Contrast datacenter and edge memory: 8-GPU A100 node vs 4 GB edge device."""
|
||||
|
||||
# --- Outputs (formatted strings for prose) ---
|
||||
a100_8x_mem_str = f"{a100_8x_mem_value}" # e.g. "640"
|
||||
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
|
||||
n_gpus = 8
|
||||
|
||||
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
|
||||
a100_8x_mem = int(A100_MEM_CAPACITY.m_as(GiB)) * n_gpus
|
||||
|
||||
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
|
||||
check(a100_8x_mem > 400, "8x A100 cluster should provide >400 GiB memory.")
|
||||
|
||||
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
|
||||
a100_8x_mem_str = f"{a100_8x_mem}" # e.g. "640"
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
a100_8x_mem_str = A100ClusterMemory.a100_8x_mem_str
|
||||
```
|
||||
|
||||
**Pitfall:** *Optimizing architectural decisions for training hardware without considering deployment constraints.*
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -26,7 +26,6 @@ start_chapter("vol1:model_compression")
|
||||
|
||||
:::
|
||||
|
||||
|
||||
## Purpose {.unnumbered}
|
||||
|
||||
\begin{marginfigure}
|
||||
@@ -78,102 +77,137 @@ Bridging that gap requires a systematic discipline of *compression*: trading cap
|
||||
from mlsys.constants import *
|
||||
from mlsys.formatting import fmt, check, sci
|
||||
|
||||
# --- Inputs (GPU specs) ---
|
||||
a100_tflops_fp16_value = A100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude
|
||||
a100_tflops_int8_value = A100_FLOPS_INT8.to(TFLOPs / second).magnitude
|
||||
a100_bw_tbs_value = A100_MEM_BW.to(TB / second).magnitude
|
||||
a100_int8_speedup_value = int(a100_tflops_int8_value / a100_tflops_fp16_value)
|
||||
class CompressionSetup:
|
||||
"""Chapter-wide constants: GPU specs, energy physics, model sizes, device constraints."""
|
||||
|
||||
# --- Inputs (energy/perf illustrative values) ---
|
||||
int8_energy_reduction_value = 20
|
||||
mobilenet_int8_mj_value = 47
|
||||
mobilenet_fp32_mj_value = 312
|
||||
tpu_v4_tops_per_w_value = 0.9
|
||||
v100_tops_per_w_value = 0.3
|
||||
bandwidth_bound_speedup_value = 4
|
||||
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
|
||||
# Illustrative energy/perf values
|
||||
int8_energy_reduction = 20
|
||||
mobilenet_int8_mj = 47
|
||||
mobilenet_fp32_mj = 312
|
||||
tpu_v4_tops_per_w = 0.9
|
||||
v100_tops_per_w = 0.3
|
||||
bandwidth_bound_speedup = 4
|
||||
llm_7b_params = 7
|
||||
gpt3_training_flops_exp = 23
|
||||
|
||||
# --- Inputs (energy: multiply-add operations from constants) ---
|
||||
energy_dram_value = ENERGY_DRAM_ACCESS_PJ.magnitude
|
||||
energy_dram_per_byte_value = ENERGY_DRAM_PJ_PER_BYTE.magnitude
|
||||
energy_flop_fp32_value = ENERGY_FLOP_FP32_PJ.magnitude
|
||||
energy_flop_int8_value = ENERGY_FLOP_INT8_PJ.magnitude
|
||||
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
|
||||
# A100 specs
|
||||
a100_tflops_fp16 = A100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second)
|
||||
a100_tflops_int8 = A100_FLOPS_INT8.m_as(TFLOPs / second)
|
||||
a100_bw_tbs = A100_MEM_BW.m_as(TB / second)
|
||||
a100_int8_speedup = int(a100_tflops_int8 / a100_tflops_fp16)
|
||||
|
||||
# Energy for addition operations (Horowitz 2014, 45nm process)
|
||||
energy_add_fp32_pj_value = ENERGY_ADD_FP32_PJ.to(ureg.picojoule).magnitude
|
||||
energy_add_fp16_pj_value = ENERGY_ADD_FP16_PJ.to(ureg.picojoule).magnitude
|
||||
energy_add_int32_pj_value = ENERGY_ADD_INT32_PJ.to(ureg.picojoule).magnitude
|
||||
energy_add_int8_pj_value = ENERGY_ADD_INT8_PJ.to(ureg.picojoule).magnitude
|
||||
energy_mul_fp32_pj_value = ENERGY_FLOP_FP32_PJ.magnitude
|
||||
# Energy from constants (Horowitz 2014, 45nm process)
|
||||
energy_dram = ENERGY_DRAM_ACCESS_PJ.m_as(ureg.picojoule)
|
||||
energy_dram_per_byte = ENERGY_DRAM_PJ_PER_BYTE.m_as(ureg.picojoule / ureg.byte)
|
||||
energy_flop_fp32 = ENERGY_FLOP_FP32_PJ.m_as(ureg.picojoule / ureg.count)
|
||||
energy_flop_int8 = ENERGY_FLOP_INT8_PJ.m_as(ureg.picojoule / ureg.count)
|
||||
energy_add_fp32_pj = ENERGY_ADD_FP32_PJ.m_as(ureg.picojoule)
|
||||
energy_add_fp16_pj = ENERGY_ADD_FP16_PJ.m_as(ureg.picojoule)
|
||||
energy_add_int32_pj = ENERGY_ADD_INT32_PJ.m_as(ureg.picojoule)
|
||||
energy_add_int8_pj = ENERGY_ADD_INT8_PJ.m_as(ureg.picojoule)
|
||||
energy_mul_fp32_pj = ENERGY_FLOP_FP32_PJ.m_as(ureg.picojoule / ureg.count)
|
||||
|
||||
# INT8 vs FP32 energy ratio (MAC-to-MAC: multiply + add for each precision)
|
||||
fp32_mac_pj_value = energy_mul_fp32_pj_value + energy_add_fp32_pj_value # 3.7 + 0.9 = 4.6 pJ
|
||||
int8_mac_pj_value = energy_flop_int8_value + energy_add_int8_pj_value # 0.2 + 0.03 = 0.23 pJ
|
||||
int8_fp32_energy_ratio_value = fp32_mac_pj_value / int8_mac_pj_value
|
||||
# INT8 vs FP32 MAC energy ratio
|
||||
fp32_mac_pj = energy_mul_fp32_pj + energy_add_fp32_pj # 3.7 + 0.9 = 4.6 pJ
|
||||
int8_mac_pj = energy_flop_int8 + energy_add_int8_pj # 0.2 + 0.03 = 0.23 pJ
|
||||
int8_fp32_energy_ratio = fp32_mac_pj / int8_mac_pj
|
||||
|
||||
# V100 specs
|
||||
v100_bw_gbs_value = V100_MEM_BW.to(GB / second).magnitude
|
||||
v100_tflops_fp32_value = V100_FLOPS_FP32.to(TFLOPs / second).magnitude
|
||||
# V100 specs
|
||||
v100_bw_gbs = V100_MEM_BW.m_as(GB / second)
|
||||
v100_tflops_fp32 = V100_FLOPS_FP32.m_as(TFLOPs / second)
|
||||
|
||||
# Model specs
|
||||
resnet_params_m_value = RESNET50_PARAMS.to(Mparam).magnitude
|
||||
resnet_gflops_value = RESNET50_FLOPs.to(GFLOPs).magnitude
|
||||
mobilenetv2_mflops_value = MOBILENETV2_FLOPs.to(GFLOPs).magnitude * 1000
|
||||
# Model specs
|
||||
resnet_params_m = RESNET50_PARAMS.m_as(Mparam)
|
||||
resnet_gflops = RESNET50_FLOPs.m_as(GFLOPs)
|
||||
mobilenetv2_mflops = MOBILENETV2_FLOPs.m_as(GFLOPs) * 1000
|
||||
|
||||
# LLM parameter/memory calculations
|
||||
llm_7b_params_value = 7
|
||||
llm_7b_mem_fp16_gb_value = llm_7b_params_value * 2
|
||||
llm_175b_params_value = GPT3_PARAMS.to(Bparam).magnitude
|
||||
llm_175b_mem_fp16_gb_value = llm_175b_params_value * 2
|
||||
# LLM memory
|
||||
llm_7b_mem_fp16_gb = llm_7b_params * 2
|
||||
llm_175b_params = GPT3_PARAMS.m_as(Bparam)
|
||||
llm_175b_mem_fp16_gb = llm_175b_params * 2
|
||||
|
||||
# Device memory constraints
|
||||
smartphone_ram_gb_value = SMARTPHONE_RAM_GB.to(GB).magnitude
|
||||
mcu_ram_kb_value = MCU_RAM_KIB.to(KiB).magnitude
|
||||
# Device memory
|
||||
smartphone_ram_gb = SMARTPHONE_RAM_GB.m_as(GB)
|
||||
mcu_ram_kb = MCU_RAM_KIB.m_as(KiB)
|
||||
|
||||
# GPT-3 training FLOPs
|
||||
gpt3_training_flops_exp_value = 23
|
||||
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
|
||||
check(a100_int8_speedup >= 2, "A100 INT8 should be at least 2x faster than FP16.")
|
||||
check(int8_fp32_energy_ratio > 1, "FP32 MAC must cost more energy than INT8 MAC.")
|
||||
|
||||
# --- Outputs (formatted strings for prose) ---
|
||||
a100_tflops_fp16_str = fmt(a100_tflops_fp16_value, precision=0, commas=False)
|
||||
a100_tflops_int8_str = fmt(a100_tflops_int8_value, precision=0, commas=False)
|
||||
a100_bw_tbs_str = fmt(a100_bw_tbs_value, precision=1, commas=False)
|
||||
a100_int8_speedup_str = fmt(a100_int8_speedup_value, precision=0, commas=False)
|
||||
int8_energy_reduction_str = fmt(int8_energy_reduction_value, precision=0, commas=False)
|
||||
mobilenet_int8_mj_str = fmt(mobilenet_int8_mj_value, precision=0, commas=False)
|
||||
mobilenet_fp32_mj_str = fmt(mobilenet_fp32_mj_value, precision=0, commas=False)
|
||||
tpu_v4_tops_per_w_str = fmt(tpu_v4_tops_per_w_value, precision=1, commas=False)
|
||||
v100_tops_per_w_str = fmt(v100_tops_per_w_value, precision=1, commas=False)
|
||||
bandwidth_bound_speedup_str = fmt(bandwidth_bound_speedup_value, precision=0, commas=False)
|
||||
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
|
||||
a100_tflops_fp16_str = fmt(a100_tflops_fp16, precision=0, commas=False)
|
||||
a100_tflops_int8_str = fmt(a100_tflops_int8, precision=0, commas=False)
|
||||
a100_bw_tbs_str = fmt(a100_bw_tbs, precision=1, commas=False)
|
||||
a100_int8_speedup_str = fmt(a100_int8_speedup, precision=0, commas=False)
|
||||
int8_energy_reduction_str = fmt(int8_energy_reduction, precision=0, commas=False)
|
||||
mobilenet_int8_mj_str = fmt(mobilenet_int8_mj, precision=0, commas=False)
|
||||
mobilenet_fp32_mj_str = fmt(mobilenet_fp32_mj, precision=0, commas=False)
|
||||
tpu_v4_tops_per_w_str = fmt(tpu_v4_tops_per_w, precision=1, commas=False)
|
||||
v100_tops_per_w_str = fmt(v100_tops_per_w, precision=1, commas=False)
|
||||
bandwidth_bound_speedup_str = fmt(bandwidth_bound_speedup, precision=0, commas=False)
|
||||
energy_dram_str = fmt(energy_dram, precision=0, commas=False)
|
||||
energy_dram_per_byte_str = fmt(energy_dram_per_byte, precision=0, commas=False)
|
||||
energy_flop_fp32_str = f"{energy_flop_fp32}"
|
||||
energy_flop_int8_str = f"{energy_flop_int8}"
|
||||
energy_add_fp32_str = f"{energy_add_fp32_pj}"
|
||||
energy_add_fp16_str = f"{energy_add_fp16_pj}"
|
||||
energy_add_int32_str = f"{energy_add_int32_pj}"
|
||||
energy_add_int8_str = f"{energy_add_int8_pj}"
|
||||
energy_mul_fp32_str = f"{energy_mul_fp32_pj}"
|
||||
int8_fp32_energy_ratio_str = fmt(int8_fp32_energy_ratio, precision=1, commas=False)
|
||||
v100_bw_gbs_str = fmt(v100_bw_gbs, precision=0, commas=False)
|
||||
v100_tflops_fp32_str = fmt(v100_tflops_fp32, precision=1, commas=False)
|
||||
resnet_params_m_str = fmt(resnet_params_m, precision=1, commas=False)
|
||||
resnet_gflops_str = fmt(resnet_gflops, precision=1, commas=False)
|
||||
mobilenetv2_mflops_str = fmt(mobilenetv2_mflops, precision=0, commas=False)
|
||||
llm_7b_str = f"{llm_7b_params}"
|
||||
llm_7b_mem_str = fmt(llm_7b_mem_fp16_gb, precision=0, commas=False)
|
||||
llm_175b_str = fmt(llm_175b_params, precision=0, commas=False)
|
||||
llm_175b_mem_str = fmt(llm_175b_mem_fp16_gb, precision=0, commas=False)
|
||||
smartphone_ram_str = f"{smartphone_ram_gb}"
|
||||
mcu_ram_str = f"{mcu_ram_kb}"
|
||||
gpt3_training_flops_str = f"$3.14 \\times 10^{{{gpt3_training_flops_exp}}}$"
|
||||
|
||||
energy_dram_str = fmt(energy_dram_value, precision=0, commas=False)
|
||||
energy_dram_per_byte_str = fmt(energy_dram_per_byte_value, precision=0, commas=False)
|
||||
energy_flop_fp32_str = f"{energy_flop_fp32_value}"
|
||||
energy_flop_int8_str = f"{energy_flop_int8_value}"
|
||||
|
||||
energy_add_fp32_str = f"{energy_add_fp32_pj_value}"
|
||||
energy_add_fp16_str = f"{energy_add_fp16_pj_value}"
|
||||
energy_add_int32_str = f"{energy_add_int32_pj_value}"
|
||||
energy_add_int8_str = f"{energy_add_int8_pj_value}"
|
||||
energy_mul_fp32_str = f"{energy_mul_fp32_pj_value}"
|
||||
|
||||
int8_fp32_energy_ratio_str = fmt(int8_fp32_energy_ratio_value, precision=1, commas=False)
|
||||
|
||||
v100_bw_gbs_str = fmt(v100_bw_gbs_value, precision=0, commas=False)
|
||||
v100_tflops_fp32_str = fmt(v100_tflops_fp32_value, precision=1, commas=False)
|
||||
|
||||
resnet_params_m_str = fmt(resnet_params_m_value, precision=1, commas=False)
|
||||
resnet_gflops_str = fmt(resnet_gflops_value, precision=1, commas=False)
|
||||
mobilenetv2_mflops_str = fmt(mobilenetv2_mflops_value, precision=0, commas=False)
|
||||
|
||||
llm_7b_str = f"{llm_7b_params_value}"
|
||||
llm_7b_mem_str = fmt(llm_7b_mem_fp16_gb_value, precision=0, commas=False)
|
||||
llm_175b_str = fmt(llm_175b_params_value, precision=0, commas=False)
|
||||
llm_175b_mem_str = fmt(llm_175b_mem_fp16_gb_value, precision=0, commas=False)
|
||||
smartphone_ram_str = f"{smartphone_ram_gb_value}"
|
||||
mcu_ram_str = f"{mcu_ram_kb_value}"
|
||||
gpt3_training_flops_str = f"$3.14 \\times 10^{{{gpt3_training_flops_exp_value}}}$"
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
a100_tflops_fp16_str = CompressionSetup.a100_tflops_fp16_str
|
||||
a100_tflops_int8_str = CompressionSetup.a100_tflops_int8_str
|
||||
a100_bw_tbs_str = CompressionSetup.a100_bw_tbs_str
|
||||
a100_int8_speedup_str = CompressionSetup.a100_int8_speedup_str
|
||||
int8_energy_reduction_str = CompressionSetup.int8_energy_reduction_str
|
||||
mobilenet_int8_mj_str = CompressionSetup.mobilenet_int8_mj_str
|
||||
mobilenet_fp32_mj_str = CompressionSetup.mobilenet_fp32_mj_str
|
||||
tpu_v4_tops_per_w_str = CompressionSetup.tpu_v4_tops_per_w_str
|
||||
v100_tops_per_w_str = CompressionSetup.v100_tops_per_w_str
|
||||
bandwidth_bound_speedup_str = CompressionSetup.bandwidth_bound_speedup_str
|
||||
energy_dram_str = CompressionSetup.energy_dram_str
|
||||
energy_dram_per_byte_str = CompressionSetup.energy_dram_per_byte_str
|
||||
energy_flop_fp32_str = CompressionSetup.energy_flop_fp32_str
|
||||
energy_flop_int8_str = CompressionSetup.energy_flop_int8_str
|
||||
energy_add_fp32_str = CompressionSetup.energy_add_fp32_str
|
||||
energy_add_fp16_str = CompressionSetup.energy_add_fp16_str
|
||||
energy_add_int32_str = CompressionSetup.energy_add_int32_str
|
||||
energy_add_int8_str = CompressionSetup.energy_add_int8_str
|
||||
energy_mul_fp32_str = CompressionSetup.energy_mul_fp32_str
|
||||
int8_fp32_energy_ratio_str = CompressionSetup.int8_fp32_energy_ratio_str
|
||||
v100_bw_gbs_str = CompressionSetup.v100_bw_gbs_str
|
||||
v100_tflops_fp32_str = CompressionSetup.v100_tflops_fp32_str
|
||||
resnet_params_m_str = CompressionSetup.resnet_params_m_str
|
||||
resnet_gflops_str = CompressionSetup.resnet_gflops_str
|
||||
mobilenetv2_mflops_str = CompressionSetup.mobilenetv2_mflops_str
|
||||
llm_7b_str = CompressionSetup.llm_7b_str
|
||||
llm_7b_mem_str = CompressionSetup.llm_7b_mem_str
|
||||
llm_175b_str = CompressionSetup.llm_175b_str
|
||||
llm_175b_mem_str = CompressionSetup.llm_175b_mem_str
|
||||
smartphone_ram_str = CompressionSetup.smartphone_ram_str
|
||||
mcu_ram_str = CompressionSetup.mcu_ram_str
|
||||
gpt3_training_flops_str = CompressionSetup.gpt3_training_flops_str
|
||||
# Note: v100_bw_gbs_value used by downstream fusion-calc cell
|
||||
v100_bw_gbs_value = CompressionSetup.v100_bw_gbs
|
||||
v100_tflops_fp32_value = CompressionSetup.v100_tflops_fp32
|
||||
```
|
||||
|
||||
|
||||
## Optimization Framework {#sec-model-compression-optimization-framework-9e21}
|
||||
|
||||
A `{python} llm_7b_str`-billion-parameter language model requires `{python} llm_7b_mem_str` GB just to store its weights in FP16. Your deployment target is a smartphone with `{python} smartphone_ram_str` GB of RAM shared across the operating system, applications, and your model. *The math does not work.* No amount of clever engineering changes this arithmetic: `{python} llm_7b_mem_str` GB cannot fit in `{python} smartphone_ram_str` GB. Yet users expect the model to run responsively, offline, and without draining their battery in an hour. The gap between what training produces and what deployment permits (including the Latency Budget, the maximum allowable end-to-end inference time, defined formally in @sec-model-serving) is not a minor inconvenience but a defining challenge of model compression.
|
||||
@@ -420,7 +454,6 @@ We call this phenomenon *the quantization speedup*.
|
||||
|
||||
The relative importance of each dimension varies by deployment target. Cloud systems may tolerate larger models but demand throughput; mobile devices prioritize memory and energy; embedded systems face hard constraints on all resources simultaneously. Understanding these deployment contexts shapes which optimization dimensions to prioritize.
|
||||
|
||||
|
||||
## Deployment Context {#sec-model-compression-deployment-context-0d88}
|
||||
|
||||
The optimization framework above identifies three dimensions of compression, but which dimensions matter most depends entirely on where the model will run. A datacenter GPU with 80 GB of HBM faces different binding constraints than a smartphone with shared RAM or a microcontroller with 256 KB of SRAM. @tbl-deployment-scenarios summarizes the key constraints across deployment environments.
|
||||
@@ -482,55 +515,80 @@ from mlsys.constants import (GB, GiB, MiB, KiB, MB, KB, byte,
|
||||
CLOUD_MEM_GIB, MOBILE_MEM_GIB, TINY_MEM_KIB,
|
||||
DLRM_MODEL_SIZE_FP32)
|
||||
|
||||
# --- Inputs (device capacities and model sizes) ---
|
||||
cloud_mem_value = CLOUD_MEM_GIB
|
||||
mobile_mem_value = MOBILE_MEM_GIB
|
||||
tiny_mem_value = TINY_MEM_KIB
|
||||
|
||||
dlrm_mem_value = DLRM_MODEL_SIZE_FP32
|
||||
gpt2_mem_value = 6 * GiB
|
||||
resnet_mem_value = 100 * MiB
|
||||
mobilenet_mem_value = 14 * MiB
|
||||
mobilenet_int8_mem_value = 3.5 * MiB
|
||||
dscnn_mem_value = 500 * KiB
|
||||
|
||||
# --- Process (compute fit ratios) ---
|
||||
def get_ratio(model_mem, device_mem):
|
||||
ratio = model_mem.to(byte).magnitude / device_mem.to(byte).magnitude
|
||||
def _get_ratio(model_mem, device_mem):
    """Summarize whether a model's memory footprint fits on a device.

    Both arguments must expose ``m_as(unit)``; each is read in bytes and the
    model size is divided by the device capacity.

    Returns:
        "ok" when the size ratio is strictly below 1, otherwise
        "no (Nx)" where N is the ratio formatted with zero decimals.
    """
    footprint_bytes = model_mem.m_as(byte)
    capacity_bytes = device_mem.m_as(byte)
    overflow = footprint_bytes / capacity_bytes
    # Keep the `< 1` comparison so a ratio of exactly 1 is reported as a miss.
    return "ok" if overflow < 1 else f"no ({overflow:.0f}x)"
|
||||
|
||||
dlrm_mobile_value = get_ratio(dlrm_mem_value, mobile_mem_value)
|
||||
dlrm_tiny_value = get_ratio(dlrm_mem_value, tiny_mem_value)
|
||||
class ModelDeviceComparison:
|
||||
"""Contrast model requirements with device memory: 6-order-of-magnitude deployment gap."""
|
||||
|
||||
gpt2_mobile_value = get_ratio(gpt2_mem_value, mobile_mem_value)
|
||||
gpt2_tiny_value = get_ratio(gpt2_mem_value, tiny_mem_value)
|
||||
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
|
||||
# Device capacities
|
||||
cloud_mem = CLOUD_MEM_GIB
|
||||
mobile_mem = MOBILE_MEM_GIB
|
||||
tiny_mem = TINY_MEM_KIB
|
||||
|
||||
resnet_tiny_value = get_ratio(resnet_mem_value, tiny_mem_value)
|
||||
mobilenet_tiny_value = get_ratio(mobilenet_mem_value, tiny_mem_value)
|
||||
mobilenet_int8_tiny_value = get_ratio(mobilenet_int8_mem_value, tiny_mem_value)
|
||||
# Model sizes
|
||||
dlrm_mem = DLRM_MODEL_SIZE_FP32
|
||||
gpt2_mem = 6 * GiB
|
||||
resnet_mem = 100 * MiB
|
||||
mobilenet_mem = 14 * MiB
|
||||
mobilenet_int8_mem = 3.5 * MiB
|
||||
dscnn_mem = 500 * KiB
|
||||
|
||||
# --- Outputs (formatted strings for prose) ---
|
||||
dlrm_str = f"{dlrm_mem_value.to(GB).magnitude:.0f} GB"
|
||||
gpt2_str = f"{gpt2_mem_value.to(GiB).magnitude:.0f} GB"
|
||||
resnet_str = f"{resnet_mem_value.to(MiB).magnitude:.0f} MB"
|
||||
mobilenet_str = f"{mobilenet_mem_value.to(MiB).magnitude:.0f} MB"
|
||||
mobilenet_int8_str = f"{mobilenet_int8_mem_value.to(MiB).magnitude:.1f} MB"
|
||||
dscnn_str = f"{dscnn_mem_value.to(KiB).magnitude:.0f} KB"
|
||||
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
|
||||
dlrm_mobile = _get_ratio(dlrm_mem, mobile_mem)
|
||||
dlrm_tiny = _get_ratio(dlrm_mem, tiny_mem)
|
||||
gpt2_mobile = _get_ratio(gpt2_mem, mobile_mem)
|
||||
gpt2_tiny = _get_ratio(gpt2_mem, tiny_mem)
|
||||
resnet_tiny = _get_ratio(resnet_mem, tiny_mem)
|
||||
mobilenet_tiny = _get_ratio(mobilenet_mem, tiny_mem)
|
||||
mobilenet_int8_tiny = _get_ratio(mobilenet_int8_mem, tiny_mem)
|
||||
|
||||
cloud_cap_str = f"~{cloud_mem_value.to(GiB).magnitude:.0f} GB"
|
||||
mobile_cap_str = f"~{mobile_mem_value.to(GiB).magnitude:.0f} GB"
|
||||
tiny_cap_str = f"~{tiny_mem_value.to(KiB).magnitude:.0f} KB"
|
||||
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
|
||||
# DS-CNN always fits TinyML — sanity check
|
||||
assert _get_ratio(dscnn_mem, tiny_mem) == "ok", "DS-CNN must fit in TinyML device."
|
||||
|
||||
dlrm_mobile_str = dlrm_mobile_value
|
||||
dlrm_tiny_str = dlrm_tiny_value
|
||||
gpt2_mobile_str = gpt2_mobile_value
|
||||
gpt2_tiny_str = gpt2_tiny_value
|
||||
resnet_tiny_str = resnet_tiny_value
|
||||
mobilenet_tiny_str = mobilenet_tiny_value
|
||||
mobilenet_int8_tiny_str = mobilenet_int8_tiny_value
|
||||
dscnn_tiny_str = "ok"
|
||||
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
|
||||
dlrm_str = f"{dlrm_mem.m_as(GB):.0f} GB"
|
||||
gpt2_str = f"{gpt2_mem.m_as(GiB):.0f} GB"
|
||||
resnet_str = f"{resnet_mem.m_as(MiB):.0f} MB"
|
||||
mobilenet_str = f"{mobilenet_mem.m_as(MiB):.0f} MB"
|
||||
mobilenet_int8_str = f"{mobilenet_int8_mem.m_as(MiB):.1f} MB"
|
||||
dscnn_str = f"{dscnn_mem.m_as(KiB):.0f} KB"
|
||||
cloud_cap_str = f"~{cloud_mem.m_as(GiB):.0f} GB"
|
||||
mobile_cap_str = f"~{mobile_mem.m_as(GiB):.0f} GB"
|
||||
tiny_cap_str = f"~{tiny_mem.m_as(KiB):.0f} KB"
|
||||
dlrm_mobile_str = dlrm_mobile
|
||||
dlrm_tiny_str = dlrm_tiny
|
||||
gpt2_mobile_str = gpt2_mobile
|
||||
gpt2_tiny_str = gpt2_tiny
|
||||
resnet_tiny_str = resnet_tiny
|
||||
mobilenet_tiny_str = mobilenet_tiny
|
||||
mobilenet_int8_tiny_str = mobilenet_int8_tiny
|
||||
dscnn_tiny_str = "ok"
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
dlrm_str = ModelDeviceComparison.dlrm_str
|
||||
gpt2_str = ModelDeviceComparison.gpt2_str
|
||||
resnet_str = ModelDeviceComparison.resnet_str
|
||||
mobilenet_str = ModelDeviceComparison.mobilenet_str
|
||||
mobilenet_int8_str = ModelDeviceComparison.mobilenet_int8_str
|
||||
dscnn_str = ModelDeviceComparison.dscnn_str
|
||||
cloud_cap_str = ModelDeviceComparison.cloud_cap_str
|
||||
mobile_cap_str = ModelDeviceComparison.mobile_cap_str
|
||||
tiny_cap_str = ModelDeviceComparison.tiny_cap_str
|
||||
dlrm_mobile_str = ModelDeviceComparison.dlrm_mobile_str
|
||||
dlrm_tiny_str = ModelDeviceComparison.dlrm_tiny_str
|
||||
gpt2_mobile_str = ModelDeviceComparison.gpt2_mobile_str
|
||||
gpt2_tiny_str = ModelDeviceComparison.gpt2_tiny_str
|
||||
resnet_tiny_str = ModelDeviceComparison.resnet_tiny_str
|
||||
mobilenet_tiny_str = ModelDeviceComparison.mobilenet_tiny_str
|
||||
mobilenet_int8_tiny_str = ModelDeviceComparison.mobilenet_int8_tiny_str
|
||||
dscnn_tiny_str = ModelDeviceComparison.dscnn_tiny_str
|
||||
```
|
||||
|
||||
| **Model** | **Memory** **(Runtime)** | **Storage** **(Weights)** | **Cloud** **(`{python} cloud_cap_str`)** | **Mobile** **(`{python} mobile_cap_str`)** | **TinyML** **(`{python} tiny_cap_str`)** |
|
||||
@@ -600,7 +658,6 @@ Optimization is about trading one resource for another.
|
||||
|
||||
Each deployment context above imposes a binding constraint: memory capacity on mobile devices, latency on real-time systems, energy on battery-powered sensors. The optimization techniques that follow address these constraints at three successive levels of the stack. We begin with structural methods that modify *what* computations occur, reducing the model's parameter count and operation count to fit tighter memory and compute budgets. We then turn to precision techniques that reduce how many bits represent each value, directly shrinking memory footprint and accelerating arithmetic. Finally, we address architectural approaches that improve how efficiently the remaining operations execute on physical hardware, closing the gap between theoretical savings and measured performance.
|
||||
|
||||
|
||||
## Structural Optimization {#sec-model-compression-structural-optimization-ee93}
|
||||
\index{Model Compression!structural optimization}
|
||||
|
||||
@@ -2764,7 +2821,6 @@ Test your understanding of the structural optimization techniques covered so far
|
||||
- [ ] Can you identify when to choose Neural Architecture Search over manual architecture design? Consider the trade-offs in computational cost, design space coverage, and hardware-specific optimization.
|
||||
:::
|
||||
|
||||
|
||||
## Quantization and Precision {#sec-model-compression-quantization-precision-cd46}
|
||||
\index{Model Compression!precision optimization}
|
||||
|
||||
@@ -3690,44 +3746,57 @@ Compare the two mapping diagrams side by side in @fig-calibration-ranges. Symmet
|
||||
# │ zero_point_str, x_val_str, x_q_str, x_recon_str
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
from mlsys.formatting import fmt, check
|
||||
from mlsys.constants import KIB_TO_BYTES
|
||||
|
||||
# --- Inputs (activation range example) ---
|
||||
alpha_value = -1.0
|
||||
beta_value = 3.0
|
||||
bits_value = 8
|
||||
x_val_value = 0.0 # value to quantize
|
||||
class QuantizationMathCalc:
|
||||
"""Derive affine quantization parameters: scale and zero-point for [-1.0, 3.0] → UINT8."""
|
||||
|
||||
# --- Process (calculate affine parameters) ---
|
||||
# 1. Calculate Scale (s)
|
||||
# s = (beta - alpha) / (2^b - 1)
|
||||
int_steps_value = 2**bits_value - 1
|
||||
scale_value = (beta_value - alpha_value) / int_steps_value
|
||||
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
|
||||
alpha = -1.0 # activation range min
|
||||
beta = 3.0 # activation range max
|
||||
bits = 8 # target bit-width
|
||||
x_val = 0.0 # value to quantize
|
||||
|
||||
# 2. Calculate Zero-Point (z)
|
||||
# z = round(-alpha / s)
|
||||
# Note: z maps the real value 0.0 to an integer
|
||||
zero_point_value = round(-alpha_value / scale_value)
|
||||
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
|
||||
# 1. Scale: s = (beta - alpha) / (2^b - 1)
|
||||
int_steps = 2**bits - 1
|
||||
scale = (beta - alpha) / int_steps
|
||||
|
||||
# 3. Quantize a value
|
||||
# x_q = clamp(round(x / s) + z, 0, 2^b - 1)
|
||||
x_q_raw = round(x_val_value / scale_value) + zero_point_value
|
||||
x_q_value = max(0, min(int_steps_value, x_q_raw))
|
||||
# 2. Zero-point: z = round(-alpha / s)
|
||||
zero_point = round(-alpha / scale)
|
||||
|
||||
# 4. Dequantize (reconstruct)
|
||||
# x_recon = (x_q - z) * s
|
||||
x_recon_value = (x_q_value - zero_point_value) * scale_value
|
||||
# 3. Quantize: x_q = clamp(round(x/s) + z, 0, 2^b - 1)
|
||||
x_q_raw = round(x_val / scale) + zero_point
|
||||
x_q = max(0, min(int_steps, x_q_raw))
|
||||
|
||||
# --- Outputs (formatted strings for prose) ---
|
||||
alpha_str = fmt(alpha_value, precision=1, commas=False) # "-1.0"
|
||||
beta_str = fmt(beta_value, precision=1, commas=False) # "3.0"
|
||||
range_str = fmt(beta_value - alpha_value, precision=1, commas=False) # "4.0"
|
||||
steps_str = f"{int_steps_value}" # "255"
|
||||
scale_str = fmt(scale_value, precision=4, commas=False) # "0.0157"
|
||||
zero_point_str = f"{int(zero_point_value)}" # "64"
|
||||
x_val_str = fmt(x_val_value, precision=1, commas=False) # "0.0"
|
||||
x_q_str = f"{int(x_q_value)}" # "64"
|
||||
x_recon_str = fmt(x_recon_value, precision=2, commas=False) # "0.00"
|
||||
# 4. Dequantize: x_recon = (x_q - z) * s
|
||||
x_recon = (x_q - zero_point) * scale
|
||||
|
||||
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
|
||||
check(scale > 0, "Scale must be positive.")
|
||||
check(0 <= zero_point <= int_steps, "Zero-point must be in valid integer range.")
|
||||
check(abs(x_recon - x_val) < scale, "Reconstruction error must be less than one step size.")
|
||||
|
||||
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
|
||||
alpha_str = fmt(alpha, precision=1, commas=False) # "-1.0"
|
||||
beta_str = fmt(beta, precision=1, commas=False) # "3.0"
|
||||
range_str = fmt(beta - alpha, precision=1, commas=False) # "4.0"
|
||||
steps_str = f"{int_steps}" # "255"
|
||||
scale_str = fmt(scale, precision=4, commas=False) # "0.0157"
|
||||
zero_point_str = f"{int(zero_point)}" # "64"
|
||||
x_val_str = fmt(x_val, precision=1, commas=False) # "0.0"
|
||||
x_q_str = f"{int(x_q)}" # "64"
|
||||
x_recon_str = fmt(x_recon, precision=2, commas=False) # "0.00"
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
alpha_str = QuantizationMathCalc.alpha_str
|
||||
beta_str = QuantizationMathCalc.beta_str
|
||||
range_str = QuantizationMathCalc.range_str
|
||||
steps_str = QuantizationMathCalc.steps_str
|
||||
scale_str = QuantizationMathCalc.scale_str
|
||||
zero_point_str = QuantizationMathCalc.zero_point_str
|
||||
x_val_str = QuantizationMathCalc.x_val_str
|
||||
x_q_str = QuantizationMathCalc.x_q_str
|
||||
x_recon_str = QuantizationMathCalc.x_recon_str
|
||||
```
|
||||
|
||||
::: {.callout-notebook title="Calculating Scale and Zero-Point"}
|
||||
@@ -4326,7 +4395,6 @@ Yet practitioners often discover a frustrating gap between theory and practice:
|
||||
|
||||
The gap arises from several sources. Sparse matrices stored in dense format waste memory bandwidth loading zeros—the hardware cannot skip what it does not know is zero. Operations that could run in parallel execute sequentially due to data dependencies the compiler cannot resolve. Simple inputs receive the same computational budget as complex ones because the model has no mechanism to exit early. Closing the gap between "optimized on paper" and "optimized in practice" is the domain of our third optimization dimension: **architectural efficiency**. This dimension ensures that structural and precision optimizations translate into real-world speedups by aligning computation patterns with hardware capabilities.
|
||||
|
||||
|
||||
## Architectural Efficiency {#sec-model-compression-architectural-efficiency-8dd3}
|
||||
|
||||
Architectural efficiency optimization ensures that computations execute efficiently on target hardware by aligning model operations with processor capabilities and memory hierarchies. Where structural optimization determines *what* computations to perform and precision optimization determines *how precisely* to compute, architectural efficiency addresses *how* operations are scheduled, memory is accessed, and workloads adapt to input characteristics. This third dimension closes the gap between theoretical compression ratios and real-world speedups.
|
||||
@@ -4452,77 +4520,102 @@ Beyond reducing what data must be stored, substantial efficiency gains emerge fr
|
||||
# │ kernels_fused_str, saved_latency_ms_str
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
from mlsys.formatting import fmt, check
|
||||
from mlsys.constants import KIB_TO_BYTES
|
||||
from mlsys.constants import KIB_TO_BYTES, MILLION
|
||||
|
||||
# --- Inputs (Conv-BN-ReLU) ---
|
||||
conv_channels_value = 256
|
||||
conv_spatial_value = 28
|
||||
bytes_per_element_value = 4
|
||||
class FusionCalc:
|
||||
"""Quantify latency and bandwidth benefits of Conv-BN-ReLU operator fusion on ResNet-50."""
|
||||
|
||||
# GEMM
|
||||
gemm_hidden_value = 768
|
||||
gemm_seq_value = 512
|
||||
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
|
||||
# Conv-BN-ReLU layer geometry
|
||||
conv_channels = 256
|
||||
conv_spatial = 28
|
||||
bytes_per_element = 4 # FP32
|
||||
|
||||
# Memory Bandwidth Analysis (ResNet-50 layer)
|
||||
# Feature map: 256 channels × 28 × 28 spatial × 4 bytes/element (FP32)
|
||||
feat_map_mb_value = conv_channels_value * conv_spatial_value * conv_spatial_value * bytes_per_element_value / MILLION # SI MB
|
||||
weights_mb_value = 2.4
|
||||
bn_params_mb_value = 0.002
|
||||
# GEMM geometry
|
||||
gemm_hidden = 768
|
||||
gemm_seq = 512
|
||||
|
||||
# Kernel Launch
|
||||
kernels_unfused_value = 159
|
||||
kernels_fused_value = 53
|
||||
latency_per_kernel_us_value = 10
|
||||
# ResNet-50 layer memory baseline
|
||||
weights_mb = 2.4
|
||||
bn_params_mb = 0.002
|
||||
|
||||
# --- Process ---
|
||||
# Conv-BN-ReLU intermediate
|
||||
conv_bn_relu_intermediate_bytes = 2 * conv_channels_value * conv_spatial_value * conv_spatial_value * bytes_per_element_value
|
||||
conv_bn_relu_intermediate_mb_value = conv_bn_relu_intermediate_bytes / (1024**2)
|
||||
# Kernel launch overhead
|
||||
kernels_unfused = 159
|
||||
kernels_fused = 53
|
||||
latency_per_kernel_us = 10
|
||||
|
||||
# GEMM intermediate
|
||||
gemm_intermediate_bytes = gemm_hidden_value * gemm_seq_value * bytes_per_element_value
|
||||
gemm_intermediate_mb_value = gemm_intermediate_bytes / (1024**2)
|
||||
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
|
||||
# Feature map size (SI MB)
|
||||
feat_map_mb = conv_channels * conv_spatial * conv_spatial * bytes_per_element / MILLION
|
||||
|
||||
# Bandwidth Analysis
|
||||
unfused_conv_mb_value = feat_map_mb_value * 2 + weights_mb_value
|
||||
unfused_bn_mb_value = feat_map_mb_value * 2 + bn_params_mb_value
|
||||
unfused_relu_mb_value = feat_map_mb_value * 2
|
||||
total_unfused_mb_value = unfused_conv_mb_value + unfused_bn_mb_value + unfused_relu_mb_value
|
||||
    # Conv-BN-ReLU intermediates (2 feature maps: Conv→BN and BN→ReLU boundaries)
|
||||
conv_bn_relu_intermediate_mb = (
|
||||
2 * conv_channels * conv_spatial * conv_spatial * bytes_per_element / (1024**2)
|
||||
)
|
||||
|
||||
total_fused_mb_value = feat_map_mb_value * 2 + weights_mb_value
|
||||
bandwidth_reduction_pct_value = (1 - total_fused_mb_value / total_unfused_mb_value) * 100
|
||||
# GEMM intermediate
|
||||
gemm_intermediate_mb = gemm_hidden * gemm_seq * bytes_per_element / (1024**2)
|
||||
|
||||
# Kernel Launch
|
||||
saved_latency_us_value = (kernels_unfused_value - kernels_fused_value) * latency_per_kernel_us_value
|
||||
saved_latency_ms_value = saved_latency_us_value / 1000
|
||||
# Unfused bandwidth: Conv (feat*2 + weights) + BN (feat*2 + bn) + ReLU (feat*2)
|
||||
unfused_conv_mb = feat_map_mb * 2 + weights_mb
|
||||
unfused_bn_mb = feat_map_mb * 2 + bn_params_mb
|
||||
unfused_relu_mb = feat_map_mb * 2
|
||||
total_unfused_mb = unfused_conv_mb + unfused_bn_mb + unfused_relu_mb
|
||||
|
||||
# V100 timing analysis (memory-bound)
|
||||
v100_bw_gbs_local_value = v100_bw_gbs_value # from earlier cell
|
||||
unfused_time_us_value = total_unfused_mb_value / v100_bw_gbs_local_value * 1000 # MB / (GB/s) * 1000 = us
|
||||
fused_time_us_value = total_fused_mb_value / v100_bw_gbs_local_value * 1000
|
||||
fusion_speedup_value = unfused_time_us_value / fused_time_us_value
|
||||
# Fused bandwidth: read input + weights once, write output once
|
||||
total_fused_mb = feat_map_mb * 2 + weights_mb
|
||||
bandwidth_reduction_pct = (1 - total_fused_mb / total_unfused_mb) * 100
|
||||
|
||||
# --- Outputs (formatted strings for prose) ---
|
||||
conv_bn_relu_intermediate_mb_str = fmt(conv_bn_relu_intermediate_mb_value, precision=1, commas=False)
|
||||
gemm_intermediate_mb_str = fmt(gemm_intermediate_mb_value, precision=1, commas=False)
|
||||
# Kernel launch savings
|
||||
saved_latency_us = (kernels_unfused - kernels_fused) * latency_per_kernel_us
|
||||
saved_latency_ms = saved_latency_us / 1000
|
||||
|
||||
feat_map_kb_str = fmt(feat_map_mb_value * 1000, precision=0, commas=False)
|
||||
weights_mb_str = fmt(weights_mb_value, precision=1, commas=False)
|
||||
bn_params_kb_str = fmt(bn_params_mb_value * KIB_TO_BYTES, precision=0, commas=False)
|
||||
# V100 timing (memory-bound): MB / (GB/s) * 1000 = µs
|
||||
unfused_time_us = total_unfused_mb / v100_bw_gbs_value * 1000
|
||||
fused_time_us = total_fused_mb / v100_bw_gbs_value * 1000
|
||||
fusion_speedup = unfused_time_us / fused_time_us
|
||||
|
||||
unfused_conv_mb_str = fmt(unfused_conv_mb_value, precision=1, commas=False)
|
||||
unfused_bn_mb_str = fmt(unfused_bn_mb_value, precision=1, commas=False)
|
||||
unfused_relu_mb_str = fmt(unfused_relu_mb_value, precision=1, commas=False)
|
||||
total_unfused_mb_str = fmt(total_unfused_mb_value, precision=1, commas=False)
|
||||
total_fused_mb_str = fmt(total_fused_mb_value, precision=1, commas=False)
|
||||
bandwidth_reduction_pct_str = fmt(bandwidth_reduction_pct_value, precision=0, commas=False)
|
||||
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
|
||||
check(bandwidth_reduction_pct > 40, "Fusion should reduce bandwidth by more than 40%.")
|
||||
check(fusion_speedup > 1, "Fused execution must be faster than unfused.")
|
||||
|
||||
kernels_unfused_str = fmt(kernels_unfused_value, precision=0, commas=False)
|
||||
kernels_fused_str = fmt(kernels_fused_value, precision=0, commas=False)
|
||||
saved_latency_ms_str = fmt(saved_latency_ms_value, precision=0, commas=False)
|
||||
unfused_time_us_str = fmt(unfused_time_us_value, precision=0, commas=False)
|
||||
fused_time_us_str = fmt(fused_time_us_value, precision=1, commas=False)
|
||||
fusion_speedup_str = fmt(fusion_speedup_value, precision=2, commas=False)
|
||||
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
|
||||
conv_bn_relu_intermediate_mb_str = fmt(conv_bn_relu_intermediate_mb, precision=1, commas=False)
|
||||
gemm_intermediate_mb_str = fmt(gemm_intermediate_mb, precision=1, commas=False)
|
||||
feat_map_kb_str = fmt(feat_map_mb * 1000, precision=0, commas=False)
|
||||
weights_mb_str = fmt(weights_mb, precision=1, commas=False)
|
||||
bn_params_kb_str = fmt(bn_params_mb * KIB_TO_BYTES, precision=0, commas=False)
|
||||
unfused_conv_mb_str = fmt(unfused_conv_mb, precision=1, commas=False)
|
||||
unfused_bn_mb_str = fmt(unfused_bn_mb, precision=1, commas=False)
|
||||
unfused_relu_mb_str = fmt(unfused_relu_mb, precision=1, commas=False)
|
||||
total_unfused_mb_str = fmt(total_unfused_mb, precision=1, commas=False)
|
||||
total_fused_mb_str = fmt(total_fused_mb, precision=1, commas=False)
|
||||
bandwidth_reduction_pct_str = fmt(bandwidth_reduction_pct, precision=0, commas=False)
|
||||
kernels_unfused_str = fmt(kernels_unfused, precision=0, commas=False)
|
||||
kernels_fused_str = fmt(kernels_fused, precision=0, commas=False)
|
||||
saved_latency_ms_str = fmt(saved_latency_ms, precision=0, commas=False)
|
||||
unfused_time_us_str = fmt(unfused_time_us, precision=0, commas=False)
|
||||
fused_time_us_str = fmt(fused_time_us, precision=1, commas=False)
|
||||
fusion_speedup_str = fmt(fusion_speedup, precision=2, commas=False)
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
conv_bn_relu_intermediate_mb_str = FusionCalc.conv_bn_relu_intermediate_mb_str
|
||||
gemm_intermediate_mb_str = FusionCalc.gemm_intermediate_mb_str
|
||||
feat_map_kb_str = FusionCalc.feat_map_kb_str
|
||||
weights_mb_str = FusionCalc.weights_mb_str
|
||||
bn_params_kb_str = FusionCalc.bn_params_kb_str
|
||||
unfused_conv_mb_str = FusionCalc.unfused_conv_mb_str
|
||||
unfused_bn_mb_str = FusionCalc.unfused_bn_mb_str
|
||||
unfused_relu_mb_str = FusionCalc.unfused_relu_mb_str
|
||||
total_unfused_mb_str = FusionCalc.total_unfused_mb_str
|
||||
total_fused_mb_str = FusionCalc.total_fused_mb_str
|
||||
bandwidth_reduction_pct_str = FusionCalc.bandwidth_reduction_pct_str
|
||||
kernels_unfused_str = FusionCalc.kernels_unfused_str
|
||||
kernels_fused_str = FusionCalc.kernels_fused_str
|
||||
saved_latency_ms_str = FusionCalc.saved_latency_ms_str
|
||||
unfused_time_us_str = FusionCalc.unfused_time_us_str
|
||||
fused_time_us_str = FusionCalc.fused_time_us_str
|
||||
fusion_speedup_str = FusionCalc.fusion_speedup_str
|
||||
```
|
||||
|
||||
#### Operator Fusion {#sec-model-compression-operator-fusion-ac1d}
|
||||
@@ -4594,16 +4687,28 @@ def conv_bn_relu_fused(input, weight, gamma, beta, mean, var):
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
from mlsys.formatting import fmt, check, md_math
|
||||
|
||||
# --- Inputs (transfer counts) ---
|
||||
unfused_transfers_value = 6 # read/write for each of conv, BN, ReLU
|
||||
fused_transfers_value = 2 # read input, write output
|
||||
class ConvFusionCalc:
|
||||
"""Demonstrate 3x memory traffic reduction from Conv-BN-ReLU fusion (6 transfers → 2)."""
|
||||
|
||||
# --- Process ---
|
||||
transfer_reduction_value = unfused_transfers_value / fused_transfers_value
|
||||
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
|
||||
unfused_transfers = 6 # read/write for Conv, BN, ReLU
|
||||
fused_transfers = 2 # read input, write output
|
||||
|
||||
# --- Outputs (formatted strings for prose) ---
|
||||
transfer_reduction_str = fmt(transfer_reduction_value, precision=0, commas=False)
|
||||
conv_bn_relu_mem_md = md_math(f"2 \\times 256 \\times 28 \\times 28 \\times 4 \\text{{ bytes}} \\approx \\text{{{conv_bn_relu_intermediate_mb_str} MB}}")
|
||||
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
|
||||
transfer_reduction = unfused_transfers / fused_transfers
|
||||
|
||||
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
|
||||
check(transfer_reduction == 3, "Conv-BN-ReLU fusion must yield exactly 3x transfer reduction.")
|
||||
|
||||
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
|
||||
transfer_reduction_str = fmt(transfer_reduction, precision=0, commas=False)
|
||||
conv_bn_relu_mem_md = md_math(
|
||||
f"2 \\times 256 \\times 28 \\times 28 \\times 4 \\text{{ bytes}} \\approx \\text{{{conv_bn_relu_intermediate_mb_str} MB}}"
|
||||
)
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
transfer_reduction_str = ConvFusionCalc.transfer_reduction_str
|
||||
conv_bn_relu_mem_md = ConvFusionCalc.conv_bn_relu_mem_md
|
||||
```
|
||||
|
||||
The arithmetic operations remain identical, but memory traffic drops from 6 transfers to 2 transfers (`{python} transfer_reduction_str` $\times$ reduction). For a ResNet-50 layer with 256 channels and spatial size $28 \times 28$, this eliminates `{python} conv_bn_relu_mem_md` of intermediate memory traffic per layer.
|
||||
@@ -6276,7 +6381,6 @@ Unlike software functions that compose predictably, optimization techniques inte
|
||||
|
||||
With the three optimization dimensions now fully explored, practitioners need systematic guidance for translating this knowledge into deployment decisions.
|
||||
|
||||
|
||||
## Technique Selection {#sec-model-compression-technique-selection-ba16}
|
||||
|
||||
An engineer deploying a transformer model faces a concrete decision: the model exceeds the target device's memory by 3 $\times$, inference latency is 4 $\times$ above the SLO, and the power budget allows no more than 2 W sustained. Should she quantize first, prune first, distill to a smaller architecture, or combine techniques? The answer depends on which constraint is binding, what accuracy loss is tolerable, and how much engineering time is available. This section provides structured guidance for navigating that decision.
|
||||
@@ -6314,7 +6418,6 @@ These choices also depend on the available engineering budget. When fine-tuning
|
||||
|
||||
This decision framework provides starting points for individual technique selection. Validating that a chosen technique actually achieves its intended goal requires systematic profiling and measurement, which @sec-model-compression-efficiency-measurement-2424 formalizes in detail. However, production deployments rarely rely on a single technique. Combining pruning with quantization, or distillation with hardware-aware design, introduces interaction effects that can either amplify benefits or create unexpected accuracy degradation. The following section addresses how to sequence and combine techniques effectively.
|
||||
|
||||
|
||||
## Optimization Strategies {#sec-model-compression-optimization-strategies-f2f6}
|
||||
|
||||
The decision framework above guides individual technique selection, but the largest optimization gains emerge from combining multiple techniques. Because pruning, quantization, and architectural efficiency operate at different levels of the stack, they provide multiplicative benefits when sequenced appropriately.
|
||||
@@ -6528,7 +6631,6 @@ This example illustrates why sequencing matters: pruning first concentrates impo
|
||||
|
||||
With dozens of techniques across three optimization dimensions, rigorous measurement is essential for validating that optimizations achieve their intended goals. A practitioner who prunes, quantizes, and fuses without profiling the actual impact on target hardware is optimizing blindly.
|
||||
|
||||
|
||||
## Efficiency Measurement {#sec-model-compression-efficiency-measurement-2424}
|
||||
|
||||
A model quantized to INT8 should be 4 $\times$ smaller and roughly 3 $\times$ faster, but does it actually achieve those gains on the target hardware? Theoretical compression ratios and measured deployment improvements often diverge, sometimes dramatically, because real speedups depend on memory hierarchy effects, kernel implementations, and hardware utilization patterns that theory alone cannot predict. Translating theoretical compression ratios into measurable deployment improvements therefore requires systematic profiling and evaluation.
|
||||
@@ -6566,7 +6668,6 @@ With these comprehensive baselines in place, the measurement framework must trac
|
||||
|
||||
Rigorous measurement tells practitioners *whether* their optimizations succeeded, but the measurements themselves require tooling to perform. Profiling, quantization, pruning, and deployment all depend on software frameworks that automate otherwise prohibitively complex workflows. We turn now to the implementation tools that make these techniques practical.
|
||||
|
||||
|
||||
## Implementation Tools {#sec-model-compression-implementation-tools-4990}
|
||||
|
||||
Understanding optimization techniques is necessary but not sufficient; practical implementation relies on robust software support. Without framework tooling, quantization would require manual modification of model definitions and careful insertion of quantization operations throughout the network, while pruning would demand direct manipulation of weight tensors. Both become prohibitively complex as models scale.
|
||||
@@ -6655,7 +6756,6 @@ Sparsity heat maps show sparsity distribution across layers (@fig-sparse-heat-ma
|
||||
|
||||
With the implementation tools and visualization capabilities established, the natural question is: how do these techniques compare when a practitioner must choose among them? Each optimization approach carries distinct trade-offs in accuracy, training cost, and hardware requirements, and a structured comparison clarifies which to reach for first.
|
||||
|
||||
|
||||
## Technique Comparison {#sec-model-compression-technique-comparison-3142}
|
||||
|
||||
A comparative analysis across the three major approaches reveals how each addresses distinct aspects of the efficiency-accuracy trade-off. Pruning works best when sparse computation hardware is available and when reducing floating-point operations is critical. Quantization provides the most versatile approach with broad hardware support, making it ideal for diverse deployment scenarios. Knowledge distillation requires significant computational investment but produces consistently high-quality compressed models, making it the right choice when accuracy preservation is paramount. @tbl-optimization-comparison summarizes these trade-offs for systematic technique selection.
|
||||
@@ -6673,7 +6773,6 @@ These techniques combine synergistically, with quantization often applied after
|
||||
|
||||
With the complete optimization toolkit now surveyed—from individual techniques through combination strategies—the most instructive lessons often come not from what works but from what fails. The following fallacies and pitfalls capture the most common mistakes engineers make when applying these techniques, each grounded in the quantitative trade-offs we have established throughout the chapter.
|
||||
|
||||
|
||||
## Fallacies and Pitfalls {#sec-model-compression-fallacies-pitfalls-1b5e}
|
||||
|
||||
```{python}
|
||||
@@ -6773,7 +6872,6 @@ Teams apply post-training quantization (PTQ) to avoid retraining and achieve 96.
|
||||
|
||||
Teams achieve `{python} int8_size_reduction_str` $\times$ model size reduction through INT8 quantization and expect `{python} int8_size_reduction_str` $\times$ memory savings in deployment. In practice, runtime overhead erodes compression gains. Dequantization kernels add `{python} dequant_overhead_str`% latency overhead converting INT8 weights back to FP16. Pruned models with irregular sparsity achieve only 12% latency reduction despite `{python} param_removal_str`% parameter removal because hardware cannot skip zeroed weights efficiently. As @sec-model-compression-profiling-opportunity-analysis-477f demonstrates, a BERT model pruned to 50% sparsity and quantized to INT8 achieves `{python} actual_speedup_str`% end-to-end speedup rather than the expected `{python} expected_speedup_str` $\times$, because unstructured sparsity creates irregular memory access. Production workflows must profile *deployed* latency on target hardware, not extrapolate from compression ratios.
|
||||
|
||||
|
||||
## Summary {#sec-model-compression-summary-8229}
|
||||
|
||||
Model compression is not a bag of tricks but an engineering discipline built on three complementary dimensions: *structural optimization* determines what the model computes, *precision optimization* determines how precisely it computes, and *architectural optimization* determines how efficiently those computations execute on physical hardware. The most important lesson of this chapter is that these dimensions compose multiplicatively. Pruning alone might achieve 2 $\times$ compression; quantization alone might achieve 4 $\times$; but pruning, distillation, and quantization applied together can achieve 16 $\times$ — as BERT's compression from 440 MB to 28 MB demonstrates. The second lesson is equally important: theoretical compression ratios lie. A 4 $\times$ reduction in parameters translates to a 4 $\times$ latency improvement only when the optimization aligns with the hardware's execution model. Unstructured sparsity on hardware that lacks sparse kernels achieves almost nothing; INT8 quantization on hardware without INT8 units achieves even less. Profile on target hardware; do not trust paper metrics.
|
||||
|
||||
@@ -21,6 +21,26 @@ When training throughput is low, check MFU, communication fraction, and goodput
|
||||
```{python}
|
||||
#| label: appendix-c3-setup
|
||||
#| echo: false
|
||||
# ┌─────────────────────────────────────────────────────────────────────────────
|
||||
# │ C³ TAXONOMY — MASTER COMPUTATION
|
||||
# ├─────────────────────────────────────────────────────────────────────────────
|
||||
# │ Context: PERSISTENT — All values used throughout the C³ Taxonomy appendix:
|
||||
# │ @tbl-c3-dam-mapping, @tbl-c3-diagnostic-summary, @tbl-c3-traffic-light,
|
||||
# │ @tbl-c3-bottleneck-actions, three case studies, scorecard, and exercises.
|
||||
# │
|
||||
# │ Goal: Provide all C³ diagnostic constants — case study parameters, effective
|
||||
# │ FLOPS decomposition, and threshold strings — for the fleet-scale
|
||||
# │ bottleneck classification reference appendix.
|
||||
# │ Show: See individual section prose for formatted values. This cell provides
|
||||
# │ the physics; string attributes are display-ready.
|
||||
# │ How: calc_effective_flops() with MFU, scaling efficiency, and goodput ratio;
|
||||
# │ all results as raw floats extracted via .m_as() or .magnitude where unitless.
|
||||
# │
|
||||
# │ Imports: mlsys.constants (H100_FLOPS_FP16_TENSOR, MFU_*, SCALING_EFF_*, OVERHEAD_*, …)
|
||||
# │ mlsys.formulas (calc_effective_flops)
|
||||
# │ mlsys.formatting (fmt, check, md_math)
|
||||
# │ Exports: C3 = C3Taxonomy (accessed as C3.attribute in downstream cells)
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
import math
|
||||
from mlsys.constants import (
|
||||
@@ -35,15 +55,6 @@ from mlsys.constants import (
|
||||
from mlsys.formatting import fmt, check, md_math
|
||||
from mlsys.formulas import calc_effective_flops
|
||||
|
||||
# =============================================================================
|
||||
# PURPOSE
|
||||
# =============================================================================
|
||||
# Purpose: Compute all values for the C³ Taxonomy appendix.
|
||||
# Used in: Case studies, effective FLOPS, scorecard, and inline prose.
|
||||
#
|
||||
# Philosophy: C³ parallels D·A·M — three MECE axes for fleet-scale diagnosis.
|
||||
# Every computed value traces back to constants.py.
|
||||
|
||||
class C3Taxonomy:
|
||||
"""Namespace for C³ diagnostic examples."""
|
||||
|
||||
@@ -71,7 +82,7 @@ class C3Taxonomy:
|
||||
case3_oh_maintenance_pct = OVERHEAD_MAINTENANCE * 100
|
||||
|
||||
# Effective FLOPS calculation: 100K GPU cluster
|
||||
h100_tflops = H100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude
|
||||
h100_tflops = H100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second)
|
||||
n_gpus_eff = 100_000
|
||||
peak_pflops = n_gpus_eff * h100_tflops / 1000 # PFLOPs
|
||||
goodput_all = 1.0 - (OVERHEAD_PIPELINE_BUBBLE +
|
||||
@@ -80,7 +91,7 @@ class C3Taxonomy:
|
||||
OVERHEAD_MAINTENANCE)
|
||||
effective_pflops = calc_effective_flops(
|
||||
peak_pflops, MFU_TRAINING_HIGH, SCALING_EFF_8192GPU, goodput_all
|
||||
)
|
||||
).magnitude # extract float; calc_effective_flops returns Quantity since formulas.py upgrade
|
||||
c3_tax = peak_pflops / effective_pflops
|
||||
eff_fraction = effective_pflops / peak_pflops
|
||||
|
||||
@@ -445,12 +456,8 @@ The gap between scaling-law predictions and observed training outcomes is, in la
|
||||
```{python}
|
||||
#| label: appendix-c3-effective-flops
|
||||
#| echo: false
|
||||
|
||||
# =============================================================================
|
||||
# PURPOSE
|
||||
# =============================================================================
|
||||
# Purpose: Format effective FLOPS values for the worked example.
|
||||
# Used in: Effective FLOPS worked example prose.
|
||||
# Goal: Alias C3Taxonomy strings for the 100K-GPU effective FLOPS callout prose.
|
||||
# Exports: peak_str, eff_str, eff_pct_str, c3_tax_str, mfu_str, scaling_str, goodput_str
|
||||
|
||||
peak_str = C3.peak_pflops_str
|
||||
eff_str = C3.effective_pflops_str
|
||||
|
||||
@@ -15,6 +15,23 @@ This appendix collects the reference numbers and compact models for fleet-scale
|
||||
```{python}
|
||||
#| label: appendix-fleet-setup
|
||||
#| echo: false
|
||||
# ┌─────────────────────────────────────────────────────────────────────────────
|
||||
# │ FLEET FOUNDATIONS — MASTER COMPUTATION
|
||||
# ├─────────────────────────────────────────────────────────────────────────────
|
||||
# │ Context: PERSISTENT — All values used throughout the Fleet Foundations
|
||||
# │ appendix: hardware reference table, MTBF tables, checkpoint sizing,
|
||||
# │ effective FLOPS, comm-compute ratio, and all prose inline values.
|
||||
# │
|
||||
# │ Goal: Provide all quantitative fleet engineering constants in one place
|
||||
# │ for the "Numbers Every Fleet Engineer Should Know" reference appendix.
|
||||
# │ Show: See individual section cells for formatted values. This cell provides
|
||||
# │ the physics; formatting cells convert to display strings.
|
||||
# │ How: pint Quantities from mlsys.constants; fleet formulas from formulas.py;
|
||||
# │ all results as typed Quantities or raw floats via .m_as().
|
||||
# │
|
||||
# │ Imports: mlsys.constants (*), mlsys.formulas (calc_*), mlsys.formatting (fmt, check)
|
||||
# │ Exports: FF = FleetFoundations (accessed as FF.attribute in downstream cells)
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
import math
|
||||
from mlsys.constants import *
|
||||
@@ -26,27 +43,13 @@ from mlsys.formulas import (
|
||||
calc_young_daly_interval, calc_checkpoint_size
|
||||
)
|
||||
|
||||
# =============================================================================
|
||||
# PURPOSE
|
||||
# =============================================================================
|
||||
# Purpose: Compute all values for the Fleet Foundations appendix.
|
||||
# Used in: Reference tables, worked examples, and inline prose throughout.
|
||||
#
|
||||
# Philosophy: Fleet-scale numbers emphasize RATIOS between tiers and
|
||||
# SCALING BEHAVIOR with cluster size. Absolute values are
|
||||
# current-generation snapshots; ratios persist across generations.
|
||||
|
||||
# =============================================================================
|
||||
# NETWORK HIERARCHY
|
||||
# =============================================================================
|
||||
|
||||
class FleetFoundations:
|
||||
"""Namespace for fleet-scale reference calculations."""
|
||||
|
||||
# ── Communication Numbers ────────────────────────────────────────────────
|
||||
# Bandwidth hierarchy (GB/s)
|
||||
nvlink_h100_bw = int(NVLINK_H100_BW.to(GB / second).magnitude)
|
||||
pcie5_bw = int(PCIE_GEN5_BW.to(GB / second).magnitude)
|
||||
nvlink_h100_bw = int(NVLINK_H100_BW.m_as(GB / second))
|
||||
pcie5_bw = int(PCIE_GEN5_BW.m_as(GB / second))
|
||||
ib_ndr_bw = INFINIBAND_NDR_BW_GBS
|
||||
ib_hdr_bw = INFINIBAND_HDR_BW_GBS
|
||||
ib_xdr_bw = INFINIBAND_XDR_BW_GBS
|
||||
@@ -95,28 +98,29 @@ class FleetFoundations:
|
||||
mtbf_100k_h = calc_mtbf_cluster(GPU_MTTF_HOURS, cl_mega)
|
||||
|
||||
# Convert to minutes for readability
|
||||
mtbf_256_min = mtbf_256_h * 60
|
||||
mtbf_2048_min = mtbf_2048_h * 60
|
||||
mtbf_8192_min = mtbf_8192_h * 60
|
||||
mtbf_100k_min = mtbf_100k_h * 60
|
||||
mtbf_256_min = mtbf_256_h.m_as(ureg.minute)
|
||||
mtbf_2048_min = mtbf_2048_h.m_as(ureg.minute)
|
||||
mtbf_8192_min = mtbf_8192_h.m_as(ureg.minute)
|
||||
mtbf_100k_min = mtbf_100k_h.m_as(ureg.minute)
|
||||
|
||||
# Failure probability for a 24-hour job (using hours consistently)
|
||||
pfail_256_24h = calc_failure_probability(mtbf_256_h, 24)
|
||||
pfail_2048_24h = calc_failure_probability(mtbf_2048_h, 24)
|
||||
pfail_8192_24h = calc_failure_probability(mtbf_8192_h, 24)
|
||||
pfail_100k_24h = calc_failure_probability(mtbf_100k_h, 24)
|
||||
# Failure probability for a 24-hour job
|
||||
_24h = 24 * ureg.hour
|
||||
pfail_256_24h = calc_failure_probability(mtbf_256_h, _24h)
|
||||
pfail_2048_24h = calc_failure_probability(mtbf_2048_h, _24h)
|
||||
pfail_8192_24h = calc_failure_probability(mtbf_8192_h, _24h)
|
||||
pfail_100k_24h = calc_failure_probability(mtbf_100k_h, _24h)
|
||||
|
||||
# Checkpoint sizes (bytes)
|
||||
ckpt_7b = calc_checkpoint_size(7e9)
|
||||
# Checkpoint sizes
|
||||
ckpt_7b = calc_checkpoint_size(7e9) # Quantity[byte]
|
||||
ckpt_70b = calc_checkpoint_size(70e9)
|
||||
ckpt_175b = calc_checkpoint_size(175e9)
|
||||
ckpt_1t = calc_checkpoint_size(1e12)
|
||||
|
||||
# Convert to GB
|
||||
ckpt_7b_gb = ckpt_7b / 1e9
|
||||
ckpt_70b_gb = ckpt_70b / 1e9
|
||||
ckpt_175b_gb = ckpt_175b / 1e9
|
||||
ckpt_1t_tb = ckpt_1t / 1e12
|
||||
# Extract in GB/TB
|
||||
ckpt_7b_gb = ckpt_7b.m_as(GB)
|
||||
ckpt_70b_gb = ckpt_70b.m_as(GB)
|
||||
ckpt_175b_gb = ckpt_175b.m_as(GB)
|
||||
ckpt_1t_tb = ckpt_1t.m_as(TB)
|
||||
|
||||
# Overhead budgets
|
||||
oh_pipeline = int(OVERHEAD_PIPELINE_BUBBLE * 100)
|
||||
@@ -125,20 +129,20 @@ class FleetFoundations:
|
||||
oh_maintenance = int(OVERHEAD_MAINTENANCE * 100)
|
||||
|
||||
# ── Hardware Reference ───────────────────────────────────────────────────
|
||||
h100_flops = int(H100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude)
|
||||
h100_bw_tbs = f"{H100_MEM_BW.to(TB / second).magnitude:.2f}"
|
||||
h100_cap = int(H100_MEM_CAPACITY.to(GiB).magnitude)
|
||||
h100_tdp = int(H100_TDP.magnitude)
|
||||
h100_flops = int(H100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second))
|
||||
h100_bw_tbs = f"{H100_MEM_BW.m_as(TB / second):.2f}"
|
||||
h100_cap = int(H100_MEM_CAPACITY.m_as(GiB))
|
||||
h100_tdp = int(H100_TDP.m_as(watt))
|
||||
|
||||
b200_flops = int(B200_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude)
|
||||
b200_bw_tbs = f"{B200_MEM_BW.to(TB / second).magnitude:.0f}"
|
||||
b200_cap = int(B200_MEM_CAPACITY.to(GiB).magnitude)
|
||||
b200_tdp = int(B200_TDP.magnitude)
|
||||
b200_flops = int(B200_FLOPS_FP16_TENSOR.m_as(TFLOPs / second))
|
||||
b200_bw_tbs = f"{B200_MEM_BW.m_as(TB / second):.0f}"
|
||||
b200_cap = int(B200_MEM_CAPACITY.m_as(GiB))
|
||||
b200_tdp = int(B200_TDP.m_as(watt))
|
||||
|
||||
tpuv5_flops = int(TPUV5P_FLOPS_BF16.to(TFLOPs / second).magnitude)
|
||||
tpuv5_bw_tbs = f"{TPUV5P_MEM_BW.to(TB / second).magnitude:.2f}"
|
||||
tpuv5_cap = int(TPUV5P_MEM_CAPACITY.to(GiB).magnitude)
|
||||
tpuv5_ici = int(TPUV5P_ICI_BW.to(GB / second).magnitude)
|
||||
tpuv5_flops = int(TPUV5P_FLOPS_BF16.m_as(TFLOPs / second))
|
||||
tpuv5_bw_tbs = f"{TPUV5P_MEM_BW.m_as(TB / second):.2f}"
|
||||
tpuv5_cap = int(TPUV5P_MEM_CAPACITY.m_as(GiB))
|
||||
tpuv5_ici = int(TPUV5P_ICI_BW.m_as(GB / second))
|
||||
|
||||
# ── Power and Sustainability ─────────────────────────────────────────────
|
||||
rack_trad = RACK_POWER_TRADITIONAL_KW
|
||||
@@ -154,17 +158,19 @@ class FleetFoundations:
|
||||
|
||||
# ── Effective FLOPS Example ──────────────────────────────────────────────
|
||||
# 1024-GPU cluster, H100, realistic overheads
|
||||
peak_1024 = 1024 * H100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude
|
||||
_peak_1024_qty = 1024 * H100_FLOPS_FP16_TENSOR # Quantity[TFLOPs/s]
|
||||
peak_1024 = _peak_1024_qty.m_as(TFLOPs / second) # raw float for display
|
||||
goodput_ratio = 1.0 - (OVERHEAD_PIPELINE_BUBBLE +
|
||||
OVERHEAD_CHECKPOINT +
|
||||
OVERHEAD_FAILURE_RECOVERY +
|
||||
OVERHEAD_MAINTENANCE)
|
||||
eff_flops_1024 = calc_effective_flops(
|
||||
peak_1024,
|
||||
_eff_flops_1024_qty = calc_effective_flops(
|
||||
_peak_1024_qty,
|
||||
MFU_TRAINING_HIGH,
|
||||
SCALING_EFF_1024GPU,
|
||||
goodput_ratio
|
||||
)
|
||||
) # Quantity[flop/second]
|
||||
eff_flops_1024 = _eff_flops_1024_qty.m_as(TFLOPs / second) # raw float for display
|
||||
eff_fraction = eff_flops_1024 / peak_1024
|
||||
|
||||
# ── Invariant Checks ─────────────────────────────────────────────────────
|
||||
@@ -289,12 +295,8 @@ Communication defines the boundaries of parallelism. These tables quantify the b
|
||||
```{python}
|
||||
#| label: fleet-comm-numbers
|
||||
#| echo: false
|
||||
|
||||
# =============================================================================
|
||||
# PURPOSE
|
||||
# =============================================================================
|
||||
# Purpose: Compute communication hierarchy values for inline references.
|
||||
# Used in: Communication numbers tables and prose.
|
||||
# Goal: Format communication bandwidth and latency strings for @tbl-fleet-bandwidth-hierarchy and @tbl-fleet-latency-hierarchy.
|
||||
# Exports: nvlink_bw_str, pcie5_bw_str, ib_*_str, tpuv5_ici_str, nvlink_to_ib_str, *_lat_str
|
||||
|
||||
# ── Bandwidth ratios ────────────────────────────────────────────────────────
|
||||
nvlink_bw_str = fmt(FF.nvlink_h100_bw, precision=0)
|
||||
@@ -386,12 +388,8 @@ At fleet scale, coordination---failure recovery, checkpointing, and maintenance-
|
||||
```{python}
|
||||
#| label: fleet-mtbf-table
|
||||
#| echo: false
|
||||
|
||||
# =============================================================================
|
||||
# PURPOSE
|
||||
# =============================================================================
|
||||
# Purpose: Format MTBF and failure probability values for the table.
|
||||
# Used in: MTBF by cluster size table.
|
||||
# Goal: Format MTBF hours, minutes, and P(failure) percentages for @tbl-fleet-mtbf.
|
||||
# Exports: mtbf_256_str, mtbf_2048_str, mtbf_8192_str, mtbf_100k_str, mtbf_*_min_str, pfail_*_str
|
||||
|
||||
mtbf_256_str = fmt(FF.mtbf_256_h, precision=1, commas=False)
|
||||
mtbf_2048_str = fmt(FF.mtbf_2048_h, precision=1, commas=False)
|
||||
@@ -432,12 +430,8 @@ Checkpointing is the primary recovery mechanism, and its cost depends on the mod
|
||||
```{python}
|
||||
#| label: fleet-checkpoint-sizes
|
||||
#| echo: false
|
||||
|
||||
# =============================================================================
|
||||
# PURPOSE
|
||||
# =============================================================================
|
||||
# Purpose: Format checkpoint sizes for the reference table.
|
||||
# Used in: Checkpoint size table.
|
||||
# Goal: Format checkpoint sizes in GB/TB for @tbl-fleet-checkpoint-sizes.
|
||||
# Exports: ckpt_7b_str, ckpt_70b_str, ckpt_175b_str, ckpt_1t_str
|
||||
|
||||
ckpt_7b_str = fmt(FF.ckpt_7b_gb, precision=0)
|
||||
ckpt_70b_str = fmt(FF.ckpt_70b_gb, precision=0)
|
||||
@@ -484,12 +478,8 @@ These numbers reflect the current generation of fleet-scale hardware. Use them f
|
||||
```{python}
|
||||
#| label: fleet-hardware-ref
|
||||
#| echo: false
|
||||
|
||||
# =============================================================================
|
||||
# PURPOSE
|
||||
# =============================================================================
|
||||
# Purpose: Format hardware reference values for the comparison table.
|
||||
# Used in: Current hardware reference table.
|
||||
# Goal: Format H100, B200, and TPU v5p specs for @tbl-fleet-hardware-ref.
|
||||
# Exports: h100_flops_str, h100_bw_str, h100_cap_str, h100_tdp_str, b200_*, tpuv5_*
|
||||
|
||||
h100_flops_str = fmt(FF.h100_flops, precision=0)
|
||||
h100_bw_str = FF.h100_bw_tbs
|
||||
@@ -547,36 +537,52 @@ Volume I introduced Amdahl's Law for a single machine, where the serial fraction
|
||||
```{python}
|
||||
#| label: fleet-amdahl-example
|
||||
#| echo: false
|
||||
# ┌─────────────────────────────────────────────────────────────────────────────
|
||||
# │ FLEET AMDAHL EXAMPLE
|
||||
# ├─────────────────────────────────────────────────────────────────────────────
|
||||
# │ Context: @sec-fleet-foundations-amdahls-fleet worked example
|
||||
# │
|
||||
# │ Goal: Compute Amdahl's Law speedup at 32/256/1024/8192 GPUs for 10% serial fraction.
|
||||
# │ Show: Speedup values and the Amdahl ceiling for inline prose.
|
||||
# │ How: calc_amdahls_speedup() from formulas.py; check() for invariants.
|
||||
# │
|
||||
# │ Imports: mlsys.formulas (calc_amdahls_speedup), mlsys.formatting (fmt, check)
|
||||
# │ Exports: s_fleet_pct_str, max_speedup_str, su_32_str, su_256_str, su_1024_str, su_8192_str
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# =============================================================================
|
||||
# PURPOSE
|
||||
# =============================================================================
|
||||
# Purpose: Compute Amdahl's Law examples at fleet scale.
|
||||
# Used in: Amdahl's Law at Fleet Scale worked example.
|
||||
class FleetAmdahlExample:
|
||||
"""Amdahl's Law speedup at 32/256/1024/8192 GPUs for 10% serial fraction."""
|
||||
|
||||
# ── PARAMETERS ──────────────────────────────────────────────────────────────
|
||||
s_fleet = 0.10 # 10% serial fraction (communication + sync)
|
||||
n_values = [32, 256, 1024, 8192]
|
||||
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
|
||||
s_fleet = 0.10
|
||||
n_values = [32, 256, 1024, 8192]
|
||||
|
||||
# ── CALCULATION ─────────────────────────────────────────────────────────────
|
||||
speedups = {}
|
||||
for n in n_values:
|
||||
su = calc_amdahls_speedup(1 - s_fleet, n)
|
||||
speedups[n] = su
|
||||
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
|
||||
speedups = {}
|
||||
for _n in n_values:
|
||||
speedups[_n] = calc_amdahls_speedup(1 - s_fleet, _n)
|
||||
|
||||
max_speedup = 1 / s_fleet
|
||||
max_speedup = 1 / s_fleet
|
||||
|
||||
# ── INVARIANTS ──────────────────────────────────────────────────────────────
|
||||
check(speedups[8192] < max_speedup, "Speedup must be below Amdahl limit")
|
||||
check(max_speedup == 10.0, "Max speedup for 10% serial fraction must be 10x")
|
||||
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
|
||||
check(speedups[8192] < max_speedup, "Speedup must be below Amdahl limit")
|
||||
check(max_speedup == 10.0, "Max speedup for 10% serial fraction must be 10x")
|
||||
|
||||
# ── OUTPUTS ─────────────────────────────────────────────────────────────────
|
||||
s_fleet_pct_str = "10"
|
||||
max_speedup_str = fmt(max_speedup, precision=0, commas=False)
|
||||
su_32_str = fmt(speedups[32], precision=1, commas=False)
|
||||
su_256_str = fmt(speedups[256], precision=1, commas=False)
|
||||
su_1024_str = fmt(speedups[1024], precision=1, commas=False)
|
||||
su_8192_str = fmt(speedups[8192], precision=1, commas=False)
|
||||
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
|
||||
s_fleet_pct_str = "10"
|
||||
max_speedup_str = fmt(max_speedup, precision=0, commas=False)
|
||||
su_32_str = fmt(speedups[32], precision=1, commas=False)
|
||||
su_256_str = fmt(speedups[256], precision=1, commas=False)
|
||||
su_1024_str = fmt(speedups[1024], precision=1, commas=False)
|
||||
su_8192_str = fmt(speedups[8192], precision=1, commas=False)
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
s_fleet_pct_str = FleetAmdahlExample.s_fleet_pct_str
|
||||
max_speedup_str = FleetAmdahlExample.max_speedup_str
|
||||
su_32_str = FleetAmdahlExample.su_32_str
|
||||
su_256_str = FleetAmdahlExample.su_256_str
|
||||
su_1024_str = FleetAmdahlExample.su_1024_str
|
||||
su_8192_str = FleetAmdahlExample.su_8192_str
|
||||
```
|
||||
|
||||
To see the fleet-scale implications, consider a training workload where `{python} s_fleet_pct_str`% of wall-clock time is spent in synchronization, communication, and other serial overhead. Amdahl's Law gives the following speedups:
|
||||
@@ -604,58 +610,72 @@ When $\rho < 1$, computation dominates and communication can be overlapped. When
|
||||
```{python}
|
||||
#| label: fleet-comm-comp-ratio
|
||||
#| echo: false
|
||||
# ┌─────────────────────────────────────────────────────────────────────────────
|
||||
# │ FLEET COMM-COMPUTE RATIO
|
||||
# ├─────────────────────────────────────────────────────────────────────────────
|
||||
# │ Context: @sec-fleet-foundations-comm-compute-ratio worked example (@tbl-fleet-comm-comp)
|
||||
# │
|
||||
# │ Goal: Compute ρ = T_comm / T_comp for 3 scenarios: 7B DP, 350M DP, tensor-parallel.
|
||||
# │ Show: AllReduce times in ms and ρ ratios for each scenario; ~0.1 for DP 7B, ~3 for DP 350M.
|
||||
# │ How: calc_ring_allreduce_time() with IB NDR params; NVLink BW for tensor-parallel.
|
||||
# │
|
||||
# │ Imports: mlsys.constants (INFINIBAND_NDR_BW_GBS, IB_NDR_LATENCY_US, NVLINK_H100_BW, GB, second)
|
||||
# │ Exports: ar_7b_ms_str, rho_7b_str, rho_7b (raw float), ar_350m_ms_str, rho_350m_str, rho_tp_str
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# =============================================================================
|
||||
# PURPOSE
|
||||
# =============================================================================
|
||||
# Purpose: Compute communication-computation ratios for different scenarios.
|
||||
# Used in: Communication-computation ratio worked example.
|
||||
class FleetCommCompRatio:
|
||||
"""Communication-to-computation ratio ρ for three parallelism scenarios."""
|
||||
|
||||
# ── SCENARIO 1: Data parallelism, large model ──────────────────────────────
|
||||
# 7B model, 256 GPUs, IB NDR
|
||||
grad_bytes_7b = 7e9 * 2 # 7B params * 2 bytes (BF16 gradients)
|
||||
allreduce_time_7b = calc_ring_allreduce_time(
|
||||
message_bytes=grad_bytes_7b,
|
||||
n_gpus=256,
|
||||
bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9,
|
||||
latency_s=IB_NDR_LATENCY_US * 1e-6
|
||||
)
|
||||
# ── SCENARIO 1: Data parallelism, large model ──────────────────────────
|
||||
# 7B model, 256 GPUs, IB NDR
|
||||
grad_bytes_7b = 7e9 * 2 # 7B params * 2 bytes (BF16 gradients)
|
||||
allreduce_time_7b = calc_ring_allreduce_time(
|
||||
message_bytes=grad_bytes_7b,
|
||||
n_gpus=256,
|
||||
bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9,
|
||||
latency_s=IB_NDR_LATENCY_US * 1e-6
|
||||
) # Quantity[second]
|
||||
|
||||
# Computation time: assume ~50ms forward+backward per step
|
||||
comp_time_7b = 0.050 # 50 ms
|
||||
rho_7b = allreduce_time_7b / comp_time_7b
|
||||
comp_time_7b = 0.050 # 50 ms (seconds)
|
||||
rho_7b = allreduce_time_7b.m_as(ureg.second) / comp_time_7b
|
||||
|
||||
# ── SCENARIO 2: Data parallelism, small model ──────────────────────────────
|
||||
# 350M model, 256 GPUs, IB NDR
|
||||
grad_bytes_350m = 350e6 * 2 # 350M params * 2 bytes
|
||||
allreduce_time_350m = calc_ring_allreduce_time(
|
||||
message_bytes=grad_bytes_350m,
|
||||
n_gpus=256,
|
||||
bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9,
|
||||
latency_s=IB_NDR_LATENCY_US * 1e-6
|
||||
)
|
||||
comp_time_350m = 0.005 # 5 ms (smaller model)
|
||||
rho_350m = allreduce_time_350m / comp_time_350m
|
||||
# ── SCENARIO 2: Data parallelism, small model ──────────────────────────
|
||||
# 350M model, 256 GPUs, IB NDR
|
||||
grad_bytes_350m = 350e6 * 2 # 350M params * 2 bytes
|
||||
allreduce_time_350m = calc_ring_allreduce_time(
|
||||
message_bytes=grad_bytes_350m,
|
||||
n_gpus=256,
|
||||
bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9,
|
||||
latency_s=IB_NDR_LATENCY_US * 1e-6
|
||||
) # Quantity[second]
|
||||
comp_time_350m = 0.005 # 5 ms (seconds, smaller model)
|
||||
rho_350m = allreduce_time_350m.m_as(ureg.second) / comp_time_350m
|
||||
|
||||
# ── SCENARIO 3: Tensor parallelism, within node ────────────────────────────
|
||||
# Activation transfer: 8 GPUs, NVLink, ~16 MB per layer
|
||||
act_bytes = 16e6 # 16 MB
|
||||
act_transfer_time = act_bytes / (NVLINK_H100_BW.to(GB / second).magnitude * 1e9)
|
||||
comp_time_layer = 0.001 # 1 ms per layer
|
||||
rho_tp = act_transfer_time / comp_time_layer
|
||||
# ── SCENARIO 3: Tensor parallelism, within node ────────────────────────
|
||||
# Activation transfer: 8 GPUs, NVLink, ~16 MB per layer
|
||||
act_bytes = 16e6 # 16 MB
|
||||
act_transfer_time = act_bytes / (NVLINK_H100_BW.m_as(GB / second) * 1e9)
|
||||
comp_time_layer = 0.001 # 1 ms per layer
|
||||
rho_tp = act_transfer_time / comp_time_layer
|
||||
|
||||
# ── INVARIANTS ──────────────────────────────────────────────────────────────
|
||||
check(rho_7b > 0.1, "7B comm ratio must be non-trivial")
|
||||
check(rho_350m > 0.01, "350M comm ratio must be non-trivial")
|
||||
# ── INVARIANTS ──────────────────────────────────────────────────────────
|
||||
check(rho_7b > 0.1, "7B comm ratio must be non-trivial")
|
||||
check(rho_350m > 0.01, "350M comm ratio must be non-trivial")
|
||||
|
||||
# ── OUTPUTS ─────────────────────────────────────────────────────────────────
|
||||
ar_7b_ms_str = fmt(allreduce_time_7b * 1000, precision=1, commas=False)
|
||||
rho_7b_str = fmt(rho_7b, precision=2, commas=False)
|
||||
# ── OUTPUTS ─────────────────────────────────────────────────────────────
|
||||
ar_7b_ms_str = fmt(allreduce_time_7b.m_as(ureg.millisecond), precision=1, commas=False)
|
||||
rho_7b_str = fmt(rho_7b, precision=2, commas=False)
|
||||
ar_350m_ms_str = fmt(allreduce_time_350m.m_as(ureg.millisecond), precision=1, commas=False)
|
||||
rho_350m_str = fmt(rho_350m, precision=1, commas=False)
|
||||
rho_tp_str = fmt(rho_tp, precision=3, commas=False)
|
||||
|
||||
ar_350m_ms_str = fmt(allreduce_time_350m * 1000, precision=1, commas=False)
|
||||
rho_350m_str = fmt(rho_350m, precision=1, commas=False)
|
||||
|
||||
rho_tp_str = fmt(rho_tp, precision=3, commas=False)
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
ar_7b_ms_str = FleetCommCompRatio.ar_7b_ms_str
|
||||
rho_7b_str = FleetCommCompRatio.rho_7b_str
|
||||
rho_7b = FleetCommCompRatio.rho_7b # raw float used in fmt() call in prose
|
||||
ar_350m_ms_str = FleetCommCompRatio.ar_350m_ms_str
|
||||
rho_350m_str = FleetCommCompRatio.rho_350m_str
|
||||
rho_tp_str = FleetCommCompRatio.rho_tp_str
|
||||
```
|
||||
|
||||
@tbl-fleet-comm-comp shows the ratio for three representative scenarios. The contrast between them reveals why parallelism strategy must match the workload.
|
||||
@@ -685,12 +705,8 @@ The key insight for fleet-scale ML is that weak scaling is not just a mathematic
|
||||
```{python}
|
||||
#| label: fleet-effective-flops
|
||||
#| echo: false
|
||||
|
||||
# =============================================================================
|
||||
# PURPOSE
|
||||
# =============================================================================
|
||||
# Purpose: Compute effective FLOPS for the compound loss example.
|
||||
# Used in: Effective FLOPS worked example.
|
||||
# Goal: Format peak and effective FLOPS for the 1,024-GPU compound loss callout.
|
||||
# Exports: peak_str, eff_str, eff_pct_str, goodput_pct_str, mfu_pct_str, scaling_pct_str
|
||||
|
||||
peak_str = fmt(FF.peak_1024, precision=0)
|
||||
eff_str = fmt(FF.eff_flops_1024, precision=0)
|
||||
|
||||
@@ -35,6 +35,28 @@ This appendix is designed as a *reference*. Use it when you need to move from in
|
||||
```{python}
|
||||
#| label: appendix-reliability-setup
|
||||
#| echo: false
|
||||
# ┌─────────────────────────────────────────────────────────────────────────────
|
||||
# │ RELIABILITY FOUNDATIONS — MASTER COMPUTATION
|
||||
# ├─────────────────────────────────────────────────────────────────────────────
|
||||
# │ Context: PERSISTENT — All values used throughout the Reliability Foundations
|
||||
# │ appendix: @tbl-component-fit, @tbl-mtbf-cluster, @tbl-failure-prob,
|
||||
# │ @tbl-checkpoint-size, @tbl-recovery-anatomy, @tbl-strategy-comparison,
|
||||
# │ @tbl-availability-stacking, and all Young-Daly worked examples.
|
||||
# │
|
||||
# │ Goal: Provide all reliability constants — FIT rates, MTBF cascade, Young-Daly
|
||||
# │ optimal checkpoint interval, recovery anatomy, and availability stacking —
|
||||
# │ for the "Failure as a Physical Constraint" reference appendix.
|
||||
# │ Show: See individual section cells for formatted values. This cell provides
|
||||
# │ the physics; formatting cells and f-strings convert to display strings.
|
||||
# │ How: pint Quantities from mlsys.constants; calc_mtbf_node, calc_mtbf_cluster,
|
||||
# │ calc_young_daly_interval, calc_failure_probability, calc_checkpoint_size,
|
||||
# │ calc_availability_stacked from formulas.py; all extractions via .m_as().
|
||||
# │
|
||||
# │ Imports: mlsys.constants (*), mlsys.formulas (calc_mtbf_*, calc_young_daly_interval,
|
||||
# │ calc_failure_probability, calc_checkpoint_size, calc_availability_stacked)
|
||||
# │ mlsys.formatting (fmt, check)
|
||||
# │ Exports: R = ReliabilityFoundations (accessed as R.attribute in downstream cells)
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
from mlsys.constants import *
|
||||
from mlsys.formatting import fmt, check
|
||||
@@ -103,8 +125,9 @@ class ReliabilityFoundations:
|
||||
|
||||
@classmethod
|
||||
def p_failure(cls, n_gpus, duration_hours):
|
||||
mtbf_h = cls.cluster_mtbf(n_gpus)
|
||||
return calc_failure_probability(mtbf_h, duration_hours)
|
||||
mtbf_h = cls.cluster_mtbf(n_gpus) # Quantity[hour]
|
||||
dur_h = duration_hours * ureg.hour # attach unit
|
||||
return calc_failure_probability(mtbf_h, dur_h)
|
||||
|
||||
# ┌── 5. CHECKPOINT SIZING ────────────────────────────────────────
|
||||
# Mixed-precision Adam: 16 bytes/param
|
||||
@@ -114,25 +137,28 @@ class ReliabilityFoundations:
|
||||
|
||||
@classmethod
|
||||
def ckpt_size_gb(cls, n_params):
|
||||
return calc_checkpoint_size(n_params, cls.bytes_per_param) / 1e9
|
||||
return calc_checkpoint_size(n_params, cls.bytes_per_param).m_as(GB)
|
||||
|
||||
# ┌── 6. YOUNG-DALY (10K cluster, 175B model) ────────────────────
|
||||
ckpt_175b_bytes = calc_checkpoint_size(175e9, 16)
|
||||
ckpt_175b_gb = ckpt_175b_bytes / 1e9
|
||||
ckpt_write_bw = CHECKPOINT_WRITE_BW_GBS # GB/s
|
||||
ckpt_write_time_s = ckpt_175b_gb / ckpt_write_bw
|
||||
ckpt_175b_bytes = calc_checkpoint_size(175e9, 16) # Quantity[byte]
|
||||
ckpt_175b_gb = ckpt_175b_bytes.m_as(GB) # raw float in GB
|
||||
ckpt_write_bw = CHECKPOINT_WRITE_BW_GBS # GB/s (raw float)
|
||||
ckpt_write_time_s = ckpt_175b_gb / ckpt_write_bw # raw float (seconds)
|
||||
|
||||
cluster_mtbf_10k_s = cluster_mtbf_10k * SEC_PER_HOUR
|
||||
tau_opt_s = calc_young_daly_interval(ckpt_write_time_s, cluster_mtbf_10k_s)
|
||||
tau_opt_min = tau_opt_s / SECONDS_PER_MINUTE
|
||||
cluster_mtbf_10k_s = cluster_mtbf_10k.m_as(ureg.second) # raw float (seconds)
|
||||
tau_opt_s = calc_young_daly_interval(ckpt_write_time_s, cluster_mtbf_10k_s) # Quantity[second]
|
||||
tau_opt_min = tau_opt_s.m_as(ureg.minute) # raw float in minutes
|
||||
|
||||
# ┌── 7. RECOVERY TIME ───────────────────────────────────────────
|
||||
t_detect = HEARTBEAT_TIMEOUT_S
|
||||
t_reschedule = RESCHEDULE_TIME_S
|
||||
t_reload_s = ckpt_write_time_s # same BW, same size
|
||||
t_detect = HEARTBEAT_TIMEOUT_S # raw float (seconds) — kept for table display
|
||||
t_reschedule = RESCHEDULE_TIME_S # raw float (seconds) — kept for table display
|
||||
t_reload_s = ckpt_write_time_s # raw float (seconds)
|
||||
# Replay: half the interval on average
|
||||
t_replay_s = tau_opt_s / 2
|
||||
t_recovery_total_s = t_detect + t_reschedule + t_reload_s + t_replay_s
|
||||
t_replay_s = tau_opt_s / 2 # Quantity[second]
|
||||
# Sum: attach units to raw seconds, then extract in minutes (NOTE: despite the `_s` suffix, t_recovery_total_s holds minutes)
|
||||
t_recovery_total_s = (
|
||||
(t_detect + t_reschedule + t_reload_s) * ureg.second + t_replay_s
|
||||
).m_as(ureg.minute) # raw float in minutes
|
||||
|
||||
# ┌── 8. GOODPUT ─────────────────────────────────────────────────
|
||||
overhead_ckpt = OVERHEAD_CHECKPOINT
|
||||
@@ -150,8 +176,8 @@ class ReliabilityFoundations:
|
||||
R = ReliabilityFoundations # short alias for inline use
|
||||
|
||||
# ┌── INVARIANTS ──────────────────────────────────────────────────────
|
||||
check(R.cluster_mtbf_10k < 5.0,
|
||||
f"10K cluster MTBF should be < 5 hours, got {R.cluster_mtbf_10k:.2f}")
|
||||
check(R.cluster_mtbf_10k.m_as(ureg.hour) < 5.0,
|
||||
f"10K cluster MTBF should be < 5 hours, got {R.cluster_mtbf_10k.m_as(ureg.hour):.2f}")
|
||||
check(R.tau_opt_min > 5 and R.tau_opt_min < 60,
|
||||
f"Young-Daly interval should be 5-60 min, got {R.tau_opt_min:.1f}")
|
||||
check(R.p_failure(10_000, 24) > 0.99,
|
||||
@@ -159,12 +185,12 @@ check(R.p_failure(10_000, 24) > 0.99,
|
||||
|
||||
# ┌── FORMATTED OUTPUTS ──────────────────────────────────────────────
|
||||
gpu_mttf_str = fmt(R.gpu_mttf, precision=0)
|
||||
node_mtbf_str = fmt(R.node_mtbf, precision=0)
|
||||
cluster_mtbf_10k_str = fmt(R.cluster_mtbf_10k, precision=2)
|
||||
node_mtbf_str = fmt(R.node_mtbf.m_as(ureg.hour), precision=0)
|
||||
cluster_mtbf_10k_str = fmt(R.cluster_mtbf_10k.m_as(ureg.hour), precision=2)
|
||||
tau_opt_min_str = fmt(R.tau_opt_min, precision=1)
|
||||
ckpt_175b_gb_str = fmt(R.ckpt_175b_gb, precision=0)
|
||||
ckpt_write_time_str = fmt(R.ckpt_write_time_s, precision=1)
|
||||
t_recovery_str = fmt(R.t_recovery_total_s / SECONDS_PER_MINUTE, precision=1)
|
||||
t_recovery_str = fmt(R.t_recovery_total_s, precision=1)
|
||||
```
|
||||
|
||||
## Failure Probability at Scale {#sec-reliability-foundations-failure-probability}
|
||||
@@ -188,8 +214,8 @@ $$ \text{MTTF} = \frac{10^9}{\text{FIT}} $$ {#eq-mttf-from-fit}
|
||||
```{python}
|
||||
#| label: component-fit-table
|
||||
#| echo: false
|
||||
|
||||
# Format component data for the table
|
||||
# Goal: Format per-component MTTF in years for @tbl-component-fit.
|
||||
# Exports: gpu_mttf_yr, hbm_mttf_yr, nic_mttf_yr, psu_mttf_yr, pcie_mttf_yr, cable_mttf_yr, tor_mttf_yr
|
||||
gpu_mttf_yr = f"{R.gpu_mttf / HOURS_PER_YEAR:.1f}"
|
||||
hbm_mttf_yr = f"{R.hbm_mttf / HOURS_PER_YEAR:.1f}"
|
||||
nic_mttf_yr = f"{R.nic_mttf / HOURS_PER_YEAR:.1f}"
|
||||
@@ -233,24 +259,24 @@ For a cluster of $N$ identical nodes, the same logic applies one level up:
|
||||
|
||||
$$ \text{MTBF}_\text{cluster} = \frac{\text{MTBF}_\text{node}}{N} $$ {#eq-mtbf-cluster}
|
||||
|
||||
This is the **MTBF cascade**: reliability degrades linearly with component count at each level, and the levels compound. A node with `{python} f"{R.node_mtbf:,.0f}"`-hour MTBF sounds reliable. A cluster of `{python} f"{R.nodes_10k:,}"` such nodes has an MTBF of just `{python} cluster_mtbf_10k_str` hours---a failure every few hours is the expected steady state.
|
||||
This is the **MTBF cascade**: reliability degrades linearly with component count at each level, and the levels compound. A node with `{python} f"{R.node_mtbf.m_as(ureg.hour):,.0f}"`-hour MTBF sounds reliable. A cluster of `{python} f"{R.nodes_10k:,}"` such nodes has an MTBF of just `{python} cluster_mtbf_10k_str` hours---a failure every few hours is the expected steady state.
|
||||
|
||||
@tbl-mtbf-cluster shows how cluster MTBF shrinks as fleet size grows.
|
||||
|
||||
```{python}
|
||||
#| label: mtbf-cluster-table
|
||||
#| echo: false
|
||||
|
||||
# Build MTBF table data
|
||||
# Goal: Build MTBF row data (hours or minutes, failures/day) for @tbl-mtbf-cluster.
|
||||
# Exports: mtbf_data list of dicts with "gpus", "nodes", "mtbf", "per_day" keys
|
||||
mtbf_data = []
|
||||
for n_gpus in R.cluster_sizes:
|
||||
n_nodes = R.nodes_for_gpus(n_gpus)
|
||||
mtbf_h = R.cluster_mtbf(n_gpus)
|
||||
if mtbf_h >= 1.0:
|
||||
mtbf_str = f"{mtbf_h:.1f} hours"
|
||||
mtbf_h_val = R.cluster_mtbf(n_gpus).m_as(ureg.hour) # raw float in hours
|
||||
if mtbf_h_val >= 1.0:
|
||||
mtbf_str = f"{mtbf_h_val:.1f} hours"
|
||||
else:
|
||||
mtbf_str = f"{mtbf_h * SECONDS_PER_MINUTE:.0f} minutes"
|
||||
per_day = 24 / mtbf_h
|
||||
mtbf_str = f"{mtbf_h_val * 60:.0f} minutes"
|
||||
per_day = 24 / mtbf_h_val
|
||||
mtbf_data.append({
|
||||
"gpus": f"{n_gpus:,}",
|
||||
"nodes": f"{n_nodes:,}",
|
||||
@@ -292,8 +318,8 @@ When $T_\text{job} \gg \text{MTBF}$, this probability approaches 1 rapidly. @tbl
|
||||
```{python}
|
||||
#| label: failure-probability-table
|
||||
#| echo: false
|
||||
|
||||
# Build failure probability matrix
|
||||
# Goal: Compute P(≥1 failure) matrix for @tbl-failure-prob across cluster sizes and job durations.
|
||||
# Exports: fp_data dict keyed by n_gpus; values are [1-day, 1-week, 30-day] probability strings
|
||||
dur_labels = ["1 Day", "1 Week", "30 Days"]
|
||||
fp_data = {}
|
||||
for n_gpus in R.cluster_sizes:
|
||||
@@ -370,6 +396,8 @@ $$ \text{Checkpoint Size} = N_\text{params} \times 16 \text{ bytes/param} $$ {#e
|
||||
```{python}
|
||||
#| label: checkpoint-sizing-table
|
||||
#| echo: false
|
||||
# Goal: Format checkpoint sizes and write times for @tbl-checkpoint-size across 7B–1T models.
|
||||
# Exports: ckpt_data list of dicts with "label", "ckpt_gb", "write_time" keys
|
||||
|
||||
ckpt_data = []
|
||||
for i, n_params in enumerate(R.model_sizes_params):
|
||||
@@ -407,28 +435,50 @@ At frontier scale (175B+ parameters), checkpoint sizes reach the terabyte range.
|
||||
```{python}
|
||||
#| label: worked-example-young-daly
|
||||
#| echo: false
|
||||
# ┌─────────────────────────────────────────────────────────────────────────────
|
||||
# │ YOUNG-DALY WORKED EXAMPLE
|
||||
# ├─────────────────────────────────────────────────────────────────────────────
|
||||
# │ Context: @sec-reliability-foundations-worked-example callout
|
||||
# │
|
||||
# │ Goal: Compute optimal checkpoint interval τ_opt for 175B model on 10K-GPU cluster;
|
||||
# │ show scaling to 20K GPUs.
|
||||
# │ Show: ~28 min optimal interval, checkpoint overhead (δ/τ_opt, as a percentage), shorter interval at 20K GPUs.
|
||||
# │ How: 10K-GPU values are reused from R (R.cluster_mtbf_10k_s, R.ckpt_write_time_s, R.tau_opt_s); the 20K-GPU case calls calc_young_daly_interval(δ, MTBF_s) with the 20K-cluster MTBF.
|
||||
# │
|
||||
# │ Imports: mlsys.formulas (calc_young_daly_interval), mlsys.constants (GPUS_PER_HOST)
|
||||
# │ Exports: yd_mtbf_h_str, yd_delta_str, yd_tau_min_str, yd_overhead_str, tau_20k_min_str
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# All values already computed in ReliabilityFoundations
|
||||
yd_mtbf_h = R.cluster_mtbf_10k
|
||||
yd_mtbf_s = R.cluster_mtbf_10k_s
|
||||
yd_delta = R.ckpt_write_time_s
|
||||
yd_tau_s = R.tau_opt_s
|
||||
yd_tau_min = R.tau_opt_min
|
||||
class WorkedExampleYoungDaly:
|
||||
"""Young-Daly optimal checkpoint interval for 175B model on 10K-GPU cluster."""
|
||||
# All values already computed in ReliabilityFoundations
|
||||
yd_mtbf_h = R.cluster_mtbf_10k # Quantity[hour]
|
||||
yd_mtbf_s = R.cluster_mtbf_10k_s # raw float (seconds)
|
||||
yd_delta = R.ckpt_write_time_s # raw float (seconds)
|
||||
yd_tau_s = R.tau_opt_s # Quantity[second]
|
||||
yd_tau_min = R.tau_opt_min # raw float in minutes
|
||||
|
||||
# Overhead from checkpointing alone
|
||||
yd_ckpt_overhead = (yd_delta / yd_tau_s) * 100
|
||||
# Overhead from checkpointing alone
|
||||
yd_ckpt_overhead = (yd_delta / yd_tau_s.m_as(ureg.second)) * 100
|
||||
|
||||
# What if MTBF halves (20K GPUs)?
|
||||
mtbf_20k_h = R.node_mtbf / (20_000 // GPUS_PER_HOST)
|
||||
mtbf_20k_s = mtbf_20k_h * SEC_PER_HOUR
|
||||
tau_20k_s = calc_young_daly_interval(yd_delta, mtbf_20k_s)
|
||||
tau_20k_min = tau_20k_s / SECONDS_PER_MINUTE
|
||||
# What if MTBF halves (20K GPUs)?
|
||||
mtbf_20k_h = R.node_mtbf / (20_000 // GPUS_PER_HOST) # Quantity[hour]
|
||||
mtbf_20k_s = mtbf_20k_h.m_as(ureg.second) # raw float (seconds)
|
||||
tau_20k_s = calc_young_daly_interval(yd_delta, mtbf_20k_s) # Quantity[second]
|
||||
tau_20k_min = tau_20k_s.m_as(ureg.minute) # raw float in minutes
|
||||
|
||||
yd_mtbf_h_str = fmt(yd_mtbf_h, precision=2)
|
||||
yd_delta_str = fmt(yd_delta, precision=1)
|
||||
yd_tau_min_str = fmt(yd_tau_min, precision=1)
|
||||
yd_overhead_str = fmt(yd_ckpt_overhead, precision=1)
|
||||
tau_20k_min_str = fmt(tau_20k_min, precision=1)
|
||||
yd_mtbf_h_str = fmt(yd_mtbf_h.m_as(ureg.hour), precision=2)
|
||||
yd_delta_str = fmt(yd_delta, precision=1)
|
||||
yd_tau_min_str = fmt(yd_tau_min, precision=1)
|
||||
yd_overhead_str = fmt(yd_ckpt_overhead, precision=1)
|
||||
tau_20k_min_str = fmt(tau_20k_min, precision=1)
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
yd_mtbf_h_str = WorkedExampleYoungDaly.yd_mtbf_h_str
|
||||
yd_delta_str = WorkedExampleYoungDaly.yd_delta_str
|
||||
yd_tau_min_str = WorkedExampleYoungDaly.yd_tau_min_str
|
||||
yd_overhead_str = WorkedExampleYoungDaly.yd_overhead_str
|
||||
tau_20k_min_str = WorkedExampleYoungDaly.tau_20k_min_str
|
||||
```
|
||||
|
||||
::: {.callout-example title="Young-Daly: 175B Model on a 10,000-GPU Cluster"}
|
||||
@@ -470,12 +520,14 @@ $$ T_\text{recovery} = T_\text{detect} + T_\text{reschedule} + T_\text{reload} +
|
||||
```{python}
|
||||
#| label: recovery-anatomy-table
|
||||
#| echo: false
|
||||
# Goal: Format recovery phase durations for @tbl-recovery-anatomy.
|
||||
# Exports: t_detect_str, t_reschedule_str, t_reload_str, t_replay_str, t_total_str
|
||||
|
||||
t_detect_str = f"{R.t_detect}"
|
||||
t_reschedule_str = f"{R.t_reschedule}"
|
||||
t_reload_str = fmt(R.t_reload_s, precision=1)
|
||||
t_replay_str = fmt(R.t_replay_s / SECONDS_PER_MINUTE, precision=1)
|
||||
t_total_str = fmt(R.t_recovery_total_s / SECONDS_PER_MINUTE, precision=1)
|
||||
t_replay_str = fmt(R.t_replay_s.m_as(ureg.minute), precision=1)
|
||||
t_total_str = fmt(R.t_recovery_total_s, precision=1)
|
||||
```
|
||||
|
||||
+----------------------------+---------------------------+-------------------------------------------------+
|
||||
@@ -567,6 +619,8 @@ where $A$ is the availability of a single replica and $k$ is the number of repli
|
||||
```{python}
|
||||
#| label: availability-stacking-table
|
||||
#| echo: false
|
||||
# Goal: Format availability, nines count, and annual downtime for @tbl-availability-stacking.
|
||||
# Exports: avail_data list of dicts with "k", "avail", "nines", "downtime" keys
|
||||
|
||||
avail_data = []
|
||||
for k in R.avail_replicas:
|
||||
|
||||
@@ -27,7 +27,8 @@ from mlsys.constants import (
|
||||
CLOUD_EGRESS_PER_GB, USD,
|
||||
STORAGE_COST_S3_STD, STORAGE_COST_GLACIER,
|
||||
STORAGE_COST_NVME_LOW, STORAGE_COST_NVME_HIGH,
|
||||
Mparam, Bparam, TFLOPs, GFLOPs
|
||||
Mparam, Bparam, TFLOPs, GFLOPs,
|
||||
watt
|
||||
)
|
||||
from mlsys.formatting import fmt, sci, check
|
||||
|
||||
@@ -77,13 +78,25 @@ Accelerators can compute faster than storage can feed them. A modern GPU process
|
||||
# ┌─────────────────────────────────────────────────────────────────────────────
|
||||
# │ STORAGE HIERARCHY AND MODEL SPECIFICATIONS
|
||||
# ├─────────────────────────────────────────────────────────────────────────────
|
||||
# │ Context: Used across the chapter for hierarchy tables and bottleneck analysis.
|
||||
# │ Context: @sec-data-storage storage hierarchy tables and I/O bottleneck
|
||||
# │ analysis paragraphs throughout the chapter.
|
||||
# │
|
||||
# │ Goal: Provide quantitative specs for hardware and lighthouse models.
|
||||
# │ Show: The massive gap between HBM bandwidth and disk I/O.
|
||||
# │ Goal: Establish the six-tier storage hierarchy gap by computing H100 HBM
|
||||
# │ bandwidth (H100_MEM_BW) vs NVMe sequential bandwidth (NVME_SEQUENTIAL_BW),
|
||||
# │ and estimate GPT-3 checkpoint write time (GPT3_PARAMS, FP16, at NVMe
|
||||
# │ vs network storage) to show the I/O bottleneck in fault tolerance.
|
||||
# │ Show: "3.35" TB/s H100 HBM vs "~7" GB/s NVMe — inline in the storage
|
||||
# │ hierarchy tier comparison and checkpoint I/O bottleneck paragraphs.
|
||||
# │ How: Direct .m_as() for each unit conversion; H100_TDP .m_as(watt).
|
||||
# │
|
||||
# │ Imports: mlsys.constants
|
||||
# │ Exports: a100_mem, h100_bw_tbs, gpt3_params_b, resnet_params_m, etc.
|
||||
# │ Imports: mlsys.constants (A100_MEM_CAPACITY, H100_MEM_CAPACITY, H100_MEM_BW,
|
||||
# │ H100_FLOPS_FP8_TENSOR, H100_FLOPS_FP16_TENSOR, H100_TDP,
|
||||
# │ GPT3_PARAMS, RESNET50_PARAMS, NVME_SEQUENTIAL_BW,
|
||||
# │ NVLINK_H100_BW, PCIE_GEN5_BW, GiB, TB, TFLOPs, GB, second,
|
||||
# │ watt, Bparam, Mparam)
|
||||
# │ Exports: a100_mem, h100_mem, h100_bw_tbs, h100_fp8_tflops, h100_fp16_tflops,
|
||||
# │ h100_tdp_w, gpt3_params_b, resnet_params_m, nvme_bw,
|
||||
# │ nvlink_bw_gbs, pcie5_bw_gbs
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
import math
|
||||
|
||||
@@ -93,21 +106,21 @@ class StorageSetup:
|
||||
Namespace for global storage constants and specs.
|
||||
"""
|
||||
# GPU specs
|
||||
a100_mem = A100_MEM_CAPACITY.to(GiB).magnitude
|
||||
h100_mem = H100_MEM_CAPACITY.to(GiB).magnitude
|
||||
h100_bw = H100_MEM_BW.to(TB/second).magnitude
|
||||
h100_fp8 = H100_FLOPS_FP8_TENSOR.to(TFLOPs/second).magnitude
|
||||
h100_fp16 = H100_FLOPS_FP16_TENSOR.to(TFLOPs/second).magnitude
|
||||
h100_tdp = H100_TDP.magnitude
|
||||
a100_mem = A100_MEM_CAPACITY.m_as(GiB)
|
||||
h100_mem = H100_MEM_CAPACITY.m_as(GiB)
|
||||
h100_bw = H100_MEM_BW.m_as(TB/second)
|
||||
h100_fp8 = H100_FLOPS_FP8_TENSOR.m_as(TFLOPs/second)
|
||||
h100_fp16 = H100_FLOPS_FP16_TENSOR.m_as(TFLOPs/second)
|
||||
h100_tdp = H100_TDP.m_as(watt)
|
||||
|
||||
# Model specs
|
||||
gpt3_params = GPT3_PARAMS.to(Bparam).magnitude
|
||||
resnet_params = RESNET50_PARAMS.to(Mparam).magnitude
|
||||
gpt3_params = GPT3_PARAMS.m_as(Bparam)
|
||||
resnet_params = RESNET50_PARAMS.m_as(Mparam)
|
||||
|
||||
# Storage & Interconnect
|
||||
nvme_bw = NVME_SEQUENTIAL_BW.to(GB/second).magnitude
|
||||
nvlink_bw = NVLINK_H100_BW.to(GB/second).magnitude
|
||||
pcie5_bw = PCIE_GEN5_BW.to(GB/second).magnitude
|
||||
nvme_bw = NVME_SEQUENTIAL_BW.m_as(GB/second)
|
||||
nvlink_bw = NVLINK_H100_BW.m_as(GB/second)
|
||||
pcie5_bw = PCIE_GEN5_BW.m_as(GB/second)
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
a100_mem = f"{StorageSetup.a100_mem:.0f}"
|
||||
@@ -125,11 +138,11 @@ nvlink_bw_gbs = f"{StorageSetup.nvlink_bw:.0f}"
|
||||
pcie5_bw_gbs = f"{StorageSetup.pcie5_bw:.0f}"
|
||||
|
||||
# Storage
|
||||
nvme_bw = f"{NVME_SEQUENTIAL_BW.to(GB/second).magnitude:.1f}"
|
||||
nvme_bw = f"{NVME_SEQUENTIAL_BW.m_as(GB/second):.1f}"
|
||||
|
||||
# Interconnect
|
||||
nvlink_bw_gbs = f"{NVLINK_H100_BW.to(GB/second).magnitude:.0f}"
|
||||
pcie5_bw_gbs = f"{PCIE_GEN5_BW.to(GB/second).magnitude:.0f}"
|
||||
nvlink_bw_gbs = f"{NVLINK_H100_BW.m_as(GB/second):.0f}"
|
||||
pcie5_bw_gbs = f"{PCIE_GEN5_BW.m_as(GB/second):.0f}"
|
||||
|
||||
# ┌── P.I.C.O. ISOLATED SCENARIO ───────────────────────────────────────────────
|
||||
class StorageEconomics:
|
||||
|
||||
@@ -40,25 +40,66 @@ A single GPU fails perhaps once per year. A thousand GPUs experience failures da
|
||||
:::
|
||||
|
||||
```{python}
|
||||
#| label: fault-tolerance-setup
|
||||
#| echo: false
|
||||
#| label: fault-tolerance-setup
|
||||
# ┌─────────────────────────────────────────────────────────────────────────────
|
||||
# │ FAULT TOLERANCE CHAPTER SETUP
|
||||
# ├─────────────────────────────────────────────────────────────────────────────
|
||||
# │ Context: Chapter-wide registry — values used in §Young-Daly Law
|
||||
# │ (@eq-young-daly-applied, line ~1957), §Sharded Checkpointing (line ~2289),
|
||||
# │ and §Recovery Cost (line ~2365).
|
||||
# │
|
||||
# │ Goal: Pre-compute GPT-3 checkpoint size (weights + Adam states) and
|
||||
# │ per-worker shard size for 1000-worker training, motivating the
|
||||
# │ checkpoint-interval formula and distributed checkpoint design.
|
||||
# │ Show: gpt3_ckpt_tb="2.1" TB (full checkpoint),
|
||||
# │ gpt3_shard_gb="2.1" GB (per-worker shard at 1000 workers) — inline in prose.
|
||||
# │ How: Multiply GPT3_PARAMS.m_as(param) by bytes-per-param for each state;
|
||||
# │ convert result pint Quantity with .m_as(TB) and .m_as(GB).
|
||||
# │
|
||||
# │ Imports: mlsys.constants (GPT3_PARAMS, param, byte, TB, GB, BILLION),
|
||||
# │ mlsys.formatting (fmt, sci)
|
||||
# │ Exports: gpt3_params_b, gpt3_ckpt_tb, gpt3_adam_tb, gpt3_shard_gb
|
||||
# │ Note: PERSISTENT — gpt3_ckpt_tb used in §Young-Daly (line ~1957),
|
||||
# │ §Sharded Checkpointing (line ~2289), §Recovery (line ~2365, ~2385);
|
||||
# │ gpt3_shard_gb used in §Sharded Checkpointing (line ~2289), §Recovery (~2371, ~2385).
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
from mlsys.constants import *
|
||||
from mlsys.formatting import fmt, sci
|
||||
|
||||
# GPT-3 model parameters
|
||||
gpt3_params_b = f"{GPT3_PARAMS.to(param).magnitude / BILLION:.0f}"
|
||||
# ┌── P.I.C.O. ISOLATED SCENARIO ───────────────────────────────────────────────
|
||||
class FaultToleranceSetup:
|
||||
"""Namespace for GPT-3 checkpoint sizing and shard calculations."""
|
||||
|
||||
# GPT-3 checkpoint size: weights (4 bytes FP32) + Adam m (4 bytes) + Adam v (4 bytes) = 12 bytes/param
|
||||
gpt3_ckpt_bytes = GPT3_PARAMS.magnitude * 12 * byte
|
||||
gpt3_ckpt_tb = f"{gpt3_ckpt_bytes.to(TB).magnitude:.1f}"
|
||||
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
|
||||
# GPT-3 checkpoint byte layout:
|
||||
# weights (4 bytes FP32) + Adam m (4 bytes) + Adam v (4 bytes) = 12 bytes/param
|
||||
bytes_full_ckpt = 12 # bytes per param: weights + Adam m + v
|
||||
bytes_adam_only = 8 # bytes per param: Adam m + v only
|
||||
n_workers = 1000 # workers for shard size calculation
|
||||
|
||||
# GPT-3 Adam optimizer state: m + v = 8 bytes/param
|
||||
gpt3_adam_bytes = GPT3_PARAMS.magnitude * 8 * byte
|
||||
gpt3_adam_tb = f"{gpt3_adam_bytes.to(TB).magnitude:.1f}"
|
||||
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
|
||||
# Full checkpoint: weights + optimizer states
|
||||
gpt3_ckpt_bytes = GPT3_PARAMS.m_as(param) * bytes_full_ckpt * byte
|
||||
|
||||
# Per-worker shard for 1000 workers
|
||||
gpt3_shard_gb = f"{gpt3_ckpt_bytes.to(GB).magnitude / 1000:.1f}"
|
||||
# Optimizer-only checkpoint: Adam m + v (no weights)
|
||||
gpt3_adam_bytes = GPT3_PARAMS.m_as(param) * bytes_adam_only * byte
|
||||
|
||||
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
|
||||
# No check() calls needed — values are monotone functions of constants.
|
||||
|
||||
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
|
||||
gpt3_params_b = f"{GPT3_PARAMS.m_as(param) / BILLION:.0f}"
|
||||
gpt3_ckpt_tb = f"{gpt3_ckpt_bytes.m_as(TB):.1f}"
|
||||
gpt3_adam_tb = f"{gpt3_adam_bytes.m_as(TB):.1f}"
|
||||
gpt3_shard_gb = f"{gpt3_ckpt_bytes.m_as(GB) / n_workers:.1f}"
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
gpt3_params_b = FaultToleranceSetup.gpt3_params_b
|
||||
gpt3_ckpt_tb = FaultToleranceSetup.gpt3_ckpt_tb
|
||||
gpt3_adam_tb = FaultToleranceSetup.gpt3_adam_tb
|
||||
gpt3_shard_gb = FaultToleranceSetup.gpt3_shard_gb
|
||||
```
|
||||
|
||||
## Failure Analysis at Scale {#sec-fault-tolerance-reliability-reliability-failure-analysis-scale-6b4b}
|
||||
@@ -2123,45 +2164,88 @@ Imagine 10,000 GPUs, each holding a 10 GB shard of the model state, simultaneous
|
||||
While @tbl-checkpoint-overhead-by-model suggests modest overhead percentages, real deployments often encounter checkpoint times far exceeding these theoretical estimates. Diagnosing such discrepancies requires examining the full system stack.
|
||||
|
||||
```{python}
|
||||
#| label: checkpoint-debug-calc
|
||||
#| echo: false
|
||||
#| label: checkpoint-debug-calc
|
||||
# ┌─────────────────────────────────────────────────────────────────────────────
|
||||
# │ CHECKPOINT DEBUG CALCULATION
|
||||
# ├─────────────────────────────────────────────────────────────────────────────
|
||||
# │ Context: "Debugging Checkpoint Overhead" callout in §Checkpoint Overhead.
|
||||
# │
|
||||
# │ Goal: Diagnose why a 70B model checkpoint takes 10 minutes instead of
|
||||
# │ 2 minutes on an NFS-backed cluster, by computing theoretical bandwidth
|
||||
# │ limits and contention-induced effective throughput per node.
|
||||
# │ Show: total_ckpt_gb_str="420" GB, nfs_gbs_str="1.25" GB/s,
|
||||
# │ min_write_min_str="5.6" min, per_node_mbs_str="20" MB/s,
|
||||
# │ serialized_min_str="358" min — inline in the Fleet Stack diagnosis.
|
||||
# │ How: Compute weights + optimizer state size in GB; derive NFS bandwidth in
|
||||
# │ GB/s (10 Gbps / 8); calculate min write time and per-node bandwidth
|
||||
# │ under contention from 64 concurrent nodes.
|
||||
# │
|
||||
# │ Imports: (none — pure Python arithmetic, no pint quantities)
|
||||
# │ Exports: weights_gb_str, optimizer_gb_str, total_ckpt_gb_str, nfs_gbs_str,
|
||||
# │ min_write_s_str, min_write_min_str, per_node_mbs_str, serialized_min_str,
|
||||
# │ extended_weeks_str, extra_cost_k_str
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
# 70B model checkpoint sizing
|
||||
model_params_b = 70 # billions
|
||||
bytes_per_param = 2 # BF16
|
||||
weights_gb = model_params_b * bytes_per_param # 140 GB
|
||||
optimizer_gb = weights_gb * 2 # Adam first + second moments
|
||||
total_ckpt_gb = weights_gb + optimizer_gb # 420 GB
|
||||
class CheckpointDebugCalc:
|
||||
"""Diagnose 70B checkpoint overhead on NFS-backed cluster."""
|
||||
|
||||
# Storage constraints
|
||||
nfs_gbps = 10 # Gbps network
|
||||
nfs_gbs = nfs_gbps / 8 # 1.25 GB/s
|
||||
min_write_s = total_ckpt_gb / nfs_gbs # seconds
|
||||
min_write_min = min_write_s / 60 # minutes
|
||||
# ┌── 1. PARAMETERS (Inputs) ──────────────────────────────────────────────
|
||||
model_params_b = 70 # 70B parameter model
|
||||
bytes_per_param = 2 # BF16 weights
|
||||
nfs_gbps = 10 # NFS network attachment bandwidth in Gbps
|
||||
n_nodes = 64 # nodes writing simultaneously
|
||||
overhead_pct = 30 # observed training throughput loss %
|
||||
base_weeks = 2 # baseline training duration (weeks)
|
||||
extra_cost_k = 500 # additional cost from extended training ($K)
|
||||
|
||||
# Contention analysis
|
||||
n_nodes = 64
|
||||
per_node_gbs = nfs_gbs / n_nodes # GB/s per node
|
||||
per_node_mbs = per_node_gbs * 1000 # MB/s per node
|
||||
serialized_min = (total_ckpt_gb / per_node_gbs) / 60
|
||||
# ┌── 2. CALCULATION (The Physics) ────────────────────────────────────────
|
||||
# Model state sizing
|
||||
weights_gb = model_params_b * bytes_per_param # 140 GB
|
||||
optimizer_gb = weights_gb * 2 # Adam m + v moments
|
||||
total_ckpt_gb = weights_gb + optimizer_gb # 420 GB
|
||||
|
||||
# Training extension
|
||||
overhead_pct = 30
|
||||
base_weeks = 2
|
||||
extended_weeks = base_weeks * (1 + overhead_pct / 100)
|
||||
extra_cost_k = 500 # $K
|
||||
# Storage bandwidth limits
|
||||
nfs_gbs = nfs_gbps / 8 # 1.25 GB/s
|
||||
min_write_s = total_ckpt_gb / nfs_gbs # theoretical minimum seconds
|
||||
min_write_min = min_write_s / 60 # convert to minutes
|
||||
|
||||
# Format strings
|
||||
weights_gb_str = f"{weights_gb:.0f}"
|
||||
optimizer_gb_str = f"{optimizer_gb:.0f}"
|
||||
total_ckpt_gb_str = f"{total_ckpt_gb:.0f}"
|
||||
nfs_gbs_str = f"{nfs_gbs}"
|
||||
min_write_s_str = f"{min_write_s:.0f}"
|
||||
min_write_min_str = f"{min_write_min:.1f}"
|
||||
per_node_mbs_str = f"{per_node_mbs:.0f}"
|
||||
serialized_min_str = f"{serialized_min:.0f}"
|
||||
extended_weeks_str = f"{extended_weeks:.1f}"
|
||||
extra_cost_k_str = f"{extra_cost_k}"
|
||||
# Contention: 64 nodes sharing the NFS bandwidth
|
||||
per_node_gbs = nfs_gbs / n_nodes # GB/s per node under contention
|
||||
per_node_mbs = per_node_gbs * 1000 # MB/s per node
|
||||
serialized_min = (total_ckpt_gb / per_node_gbs) / 60 # worst-case serialized write time
|
||||
|
||||
# Training schedule impact
|
||||
extended_weeks = base_weeks * (1 + overhead_pct / 100)
|
||||
|
||||
# ┌── 3. INVARIANTS (Guardrails) ──────────────────────────────────────────
|
||||
assert min_write_min < 10, "Theoretical minimum must be less than observed 10 minutes"
|
||||
assert serialized_min > min_write_min, "Contention time must exceed theoretical minimum"
|
||||
|
||||
# ┌── 4. OUTPUTS (Formatting) ─────────────────────────────────────────────
|
||||
weights_gb_str = f"{weights_gb:.0f}"
|
||||
optimizer_gb_str = f"{optimizer_gb:.0f}"
|
||||
total_ckpt_gb_str = f"{total_ckpt_gb:.0f}"
|
||||
nfs_gbs_str = f"{nfs_gbs}"
|
||||
min_write_s_str = f"{min_write_s:.0f}"
|
||||
min_write_min_str = f"{min_write_min:.1f}"
|
||||
per_node_mbs_str = f"{per_node_mbs:.0f}"
|
||||
serialized_min_str = f"{serialized_min:.0f}"
|
||||
extended_weeks_str = f"{extended_weeks:.1f}"
|
||||
extra_cost_k_str = f"{extra_cost_k}"
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
weights_gb_str = CheckpointDebugCalc.weights_gb_str
|
||||
optimizer_gb_str = CheckpointDebugCalc.optimizer_gb_str
|
||||
total_ckpt_gb_str = CheckpointDebugCalc.total_ckpt_gb_str
|
||||
nfs_gbs_str = CheckpointDebugCalc.nfs_gbs_str
|
||||
min_write_s_str = CheckpointDebugCalc.min_write_s_str
|
||||
min_write_min_str = CheckpointDebugCalc.min_write_min_str
|
||||
per_node_gbs = CheckpointDebugCalc.per_node_gbs
|
||||
per_node_mbs_str = CheckpointDebugCalc.per_node_mbs_str
|
||||
serialized_min_str = CheckpointDebugCalc.serialized_min_str
|
||||
extended_weeks_str = CheckpointDebugCalc.extended_weeks_str
|
||||
extra_cost_k_str = CheckpointDebugCalc.extra_cost_k_str
|
||||
```
|
||||
|
||||
::: {.callout-example title="Debugging Checkpoint Overhead"}
|
||||
|
||||
Reference in New Issue
Block a user