refactor: use fmt_percent across Vol 1 and Vol 2 to prevent Pint precision bugs

This commit standardizes percentage formatting across the entire codebase to prevent critical rendering bugs (like the `19250000000000%` effective utilization bug in Vol 2).

Root Cause:
When dividing two Pint Quantities (e.g., `flop/second` by `TFLOPs/second`), Pint creates a mixed unit (`flop/TFLOPs`) rather than cancelling to a dimensionless value. The raw `.magnitude` of such a fraction is inflated by a factor of $10^{12}$ — the TFLOPs-to-flop conversion that was never applied. When the value was passed to `fmt(x * 100)`, the formatter then multiplied that already-massive magnitude by 100, resulting in an incorrect display.

Fix:
1. Fortified `fmt_percent` and `display_percent` in `mlsys/formatting.py` to defensively strip units using `.m_as('')`. This forces Pint to cancel out the units (e.g., a quantity of $10^{12}$ `flop/TFLOPs` collapses to the dimensionless value `1.0`) *before* the number is extracted.
2. Replaced all instances of `fmt(X * 100)` with the fortified `fmt_percent(X)` across Vol 1 and Vol 2.
3. Fixed inline f-strings in `appendix_assumptions.qmd` by moving formatting logic into the Python setup cell as `_str` variables, adhering to the book's standard practice.

Validation:
- Audited all `.magnitude` extractions in the codebase to ensure they are safe (e.g., explicitly converting to dimensionless units first).
- Ran `validate_inline_refs.py` and confirmed no Python variables are trapped inside LaTeX math mode.
- Successfully built full PDFs for both Volume 1 and Volume 2.
This commit is contained in:
Vijay Janapa Reddi
2026-02-26 20:54:12 -05:00
parent 96336ab0c6
commit 303cd26669
11 changed files with 172 additions and 165 deletions

View File

@@ -43,7 +43,7 @@ from mlsys.constants import (
BITS_PER_BYTE, SECONDS_PER_MINUTE, SEC_PER_HOUR, SEC_PER_DAY,
KIB_TO_BYTES, MILLION, DAYS_PER_MONTH
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
# Scenarios: Data Gravity, Serialization, Data Algebra, Long Tails
@@ -259,7 +259,7 @@ In systems, the mean is often misleading. Latency distributions are almost alway
```{python}
#| label: tail-latency-probability
#| echo: false
from mlsys.formatting import fmt, md_math
from mlsys.formatting import fmt_percent, fmt, md_math
# =============================================================================
# PURPOSE
@@ -283,7 +283,7 @@ p_slow_value = 1 - p_all_fast_value
# OUTPUT
# =============================================================================
p_all_fast_str = fmt(p_all_fast_value, precision=3, commas=False)
p_slow_pct_str = fmt(p_slow_value * 100, precision=1, commas=False)
p_slow_pct_str = fmt_percent(p_slow_value, precision=1, commas=False)
p_slow_eq = md_math(f"P(\\text{{Slow}}) = 1 - ({p_fast_value})^{{{n_requests_value}}} \\approx 1 - {p_all_fast_value:.3f} = {p_slow_pct_str}\\%")
```

View File

@@ -93,7 +93,7 @@ Every preceding chapter introduced decisions with measurable consequences: which
from mlsys import Hardware, Models
from mlsys.constants import *
from mlsys.formatting import fmt, sci
from mlsys.formatting import fmt_percent, fmt, sci
class BenchmarkingSetup:
"""Chapter-wide hardware and model constants for all benchmarking sections and callouts."""
@@ -557,7 +557,7 @@ Effective benchmark interpretation requires knowing the performance characterist
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import A100_MEM_BW, A100_FLOPS_FP16_TENSOR, TB, TFLOPs, second
from mlsys.constants import BILLION, MILLION
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class RooflineExamples:
@@ -611,7 +611,7 @@ class RooflineExamples:
bert_ai_b1_str = fmt(bert_ai_b1, precision=0, commas=False)
bert_perf_b1_str = fmt(bert_perf_b1, precision=0, commas=False)
bert_util_b1_str = fmt(bert_util_b1, precision=0, commas=False)
utilization_peak_pct_str = fmt(bert_util_peak * 100, precision=0, commas=False)
utilization_peak_pct_str = fmt_percent(bert_util_peak, precision=0, commas=False)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
a100_tflops_fp16_str = RooflineExamples.a100_tflops_fp16_str
@@ -670,7 +670,7 @@ from mlsys.constants import (
A100_FLOPS_FP16_TENSOR, TFLOPs, second, BILLION, MILLION,
Mparam, Bparam, BYTES_FP32, MB, TB
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class BertRoofline:
@@ -739,7 +739,7 @@ class BertRoofline:
batch32_str = str(batch_32)
utilization_peak_str = f"{util_peak}"
utilization_peak_pct_str = fmt(util_peak * 100, precision=0, commas=False)
utilization_peak_pct_str = fmt_percent(util_peak, precision=0, commas=False)
# Re-export A100 constants for this cell context
a100_tflops_fp16_str = fmt(peak_flops, precision=0, commas=False)
@@ -1907,7 +1907,7 @@ A key metric for evaluating parallelism is *scaling efficiency*, which quantifie
# │ Exports: ideal_str, eff_str, loss_str, eff_denom_str, t1_hours_str,
# │ n_gpus_str, tn_hours_str, scaling_eq_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class ScalingEfficiencyCalc:
"""Strong scaling efficiency for 8-GPU ResNet-50 training: 75% efficiency, 25% overhead loss."""
@@ -2014,7 +2014,7 @@ Training large-scale machine learning models requires substantial computational
# │ e_int8_total_str, s_load_str, s_compute_str, s_total_str,
# │ e_fp32_load_mj_str, e_fp32_compute_mj_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.constants import ENERGY_DRAM_PJ_PER_BYTE, ENERGY_FLOP_FP32_PJ, ENERGY_FLOP_INT8_PJ
class EnergyBreakdownCalc:
@@ -2360,7 +2360,7 @@ These component-level contributions explain why optimizing any single stage yiel
# │ bench_opt_total_str, bench_opt_inf_str, amdahl_ceiling_str,
# │ preprocess_pct_str, preprocess_fraction_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class AmdahlBenchmarkCalc:
"""Amdahl ceiling: 5× inference speedup → only 1.8× end-to-end when preprocessing dominates."""
@@ -2384,7 +2384,7 @@ class AmdahlBenchmarkCalc:
bench_opt_total_str = fmt(bench_opt_total_ms, precision=0, commas=False)
bench_opt_inf_str = fmt(bench_opt_inference_ms, precision=0, commas=False)
amdahl_ceiling_str = fmt(amdahl_ceiling, precision=2, commas=False)
preprocess_pct_str = fmt(preprocess_fraction * 100, precision=0, commas=False)
preprocess_pct_str = fmt_percent(preprocess_fraction, precision=0, commas=False)
preprocess_fraction_str = fmt(preprocess_fraction, precision=2, commas=False)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
@@ -2640,7 +2640,7 @@ These scenarios explain why the same hardware can report dramatically different
# │ inference_speedup_str, e2e_speedup_str, edgetpu_power_ratio_str,
# │ cpu_energy_mj_str, edgetpu_energy_mj_str, energy_ratio_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class EdgeTPUSpeedupCalc:
"""EdgeTPU vs Cortex-M7: 7.5× inference speedup, higher peak power, lower energy per inference."""
@@ -3803,7 +3803,7 @@ Acceptable degradation depends on deployment context. A 2% accuracy drop might b
# │ mv2_ece_int8_str, mv2_edge_fp32_str, mv2_edge_int8_str,
# │ mv2_edge_drop_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class MobileNetINT8Calc:
"""MobileNetV2 FP32 vs INT8: aggregate accuracy holds but calibration and edge cases degrade."""
@@ -3927,7 +3927,7 @@ Generation-specific metrics capture properties absent from discriminative benchm
# │ Exports: slow_str, fast_str, response_tokens_str, slow_toks_str,
# │ fast_toks_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class LLMThroughputCalc:
"""25 vs 100 tok/s on a 750-token response: 30 s vs 7.5 s — a 4× user-perceived difference."""

View File

@@ -81,7 +81,7 @@ For decades, the dominant strategy was straightforward: more data, better models
# │ Imports: mlsys.formatting (fmt)
# │ Exports: gpu_growth_str, gpu_period_str, web_data_growth_str, etc.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class SelectionEconomicsAnchor:
@@ -223,7 +223,7 @@ To make this concrete, consider training a model in the **GPT-2/Llama Lighthouse
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import Bparam, BILLION, TRILLION, SEC_PER_HOUR, MILLION, THOUSAND
from mlsys import Models
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class ComputeDataGap:
@@ -301,7 +301,7 @@ The systems framing reveals optimization opportunities invisible to the ML frami
# │ Imports: mlsys.formatting (fmt)
# │ Exports: training_cost_m_str, dataset_reduction_pct_str, combined_factor_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class IronLawSavings:
@@ -544,7 +544,7 @@ To make the Information-Compute Ratio concrete, consider how coreset selection i
# │ Exports: imagenet_size_str, icr_ratio_str, coreset_pct_str, etc.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import RESNET50_FLOPs, GFLOPs, IMAGENET_IMAGES
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class IcrCoresetComparison:
"""Compare learning-per-FLOP for random sampling vs. coreset selection."""
@@ -586,7 +586,7 @@ class IcrCoresetComparison:
acc_gain_random_str = fmt(acc_gain_random_value, precision=1, commas=False)
acc_gain_coreset_str = fmt(acc_gain_coreset_value, precision=1, commas=False)
acc_diff_str = fmt(acc_diff_value, precision=1, commas=False)
coreset_pct_str = fmt(coreset_fraction_value * 100, precision=0, commas=False)
coreset_pct_str = fmt_percent(coreset_fraction_value, precision=0, commas=False)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
resnet50_fwd_gflops_str = IcrCoresetComparison.resnet50_fwd_gflops_str
@@ -671,7 +671,7 @@ Why does this heterogeneity exist? The answer lies in how neural networks learn
# │ Imports: mlsys.formatting (fmt)
# │ Exports: epsilon_str, epsilon_pct_str, n_clean_str, n_noisy_str, ratio_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class QualityMultiplier:
@@ -697,7 +697,7 @@ class QualityMultiplier:
# ┌── 4. OUTPUT (Formatting) ──────────────────────────────────────────────
epsilon_str = fmt(epsilon, precision=2, commas=False)
epsilon_pct_str = fmt(epsilon * 100, precision=0, commas=False)
epsilon_pct_str = fmt_percent(epsilon, precision=0, commas=False)
n_clean_str = fmt(n_clean, precision=0, commas=False)
n_noisy_str = fmt(n_noisy, precision=0, commas=True)
ratio_str = fmt(ratio, precision=0, commas=False)
@@ -875,7 +875,7 @@ Given these trade-offs, most practitioners find that EL2N with a small proxy mod
# │ Imports: mlsys.formatting (fmt)
# │ Exports: n_train_images_str, coreset_fraction_pct_str, n_coreset_str, etc.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class CoresetPractice:
"""Practical 10× coreset workflow: 5-epoch proxy selects 100K from 1M images."""
@@ -894,7 +894,7 @@ class CoresetPractice:
# ┌── 4. OUTPUT (Formatting) ─────────────────────────────────────────────
n_train_images_str = fmt(n_train_images_value / MILLION, precision=0) + " million"
coreset_fraction_pct_str = fmt(coreset_fraction_value * 100, precision=0, commas=False)
coreset_fraction_pct_str = fmt_percent(coreset_fraction_value, precision=0, commas=False)
n_coreset_str = fmt(n_coreset_value, precision=0, commas=True)
n_epochs_proxy_str = fmt(n_epochs_proxy_value, precision=0, commas=False)
@@ -1061,7 +1061,7 @@ From a systems perspective, curriculum learning improves convergence by reducing
# │ Imports: mlsys.formatting (fmt)
# │ Exports: cifar10_speedup_str, imagenet_speedup_str, mentornet_speedup_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class CurriculumBenchmarks:
"""Curriculum learning convergence speedups across CIFAR-10, CIFAR-100, ImageNet, MentorNet."""
@@ -1214,7 +1214,7 @@ The economic implications are substantial. In production settings, labeling cost
# │ Imports: mlsys.formatting (fmt)
# │ Exports: n_unlabeled_str, cost_saving_str, speedup_str, etc.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class ActiveLearningRoi:
"""Medical imaging active learning: 20× speedup, $4.75M savings vs. naive labeling."""
@@ -1385,7 +1385,7 @@ The systems trade-off in semi-supervised learning is straightforward: it typical
# │ Imports: mlsys.formatting (fmt)
# │ Exports: cifar10_fixmatch_*_str, acc_loss_str, cost_reduction_str, etc.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class FixmatchLabelEfficiency:
"""FixMatch CIFAR-10: 200× label reduction for ~8× total cost savings."""
@@ -1551,7 +1551,7 @@ To illustrate this economic transformation, consider a company building ten spec
# │ Imports: mlsys.formatting (fmt)
# │ Exports: label_cost_drop_str, marginal_compute_reduction_str, etc.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class FoundationCostAmortization:
"""Foundation model amortization: 10 tasks, 100× label reduction, 20× marginal compute drop."""
@@ -2105,7 +2105,7 @@ Here $T_{selection}$ is the time spent scoring the pool and $T_{train}$ is the c
# │ Imports: mlsys.formatting (fmt)
# │ Exports: score_a_str, savings_b_pct_str, trap_pct_str, etc.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class SelectionInequalityCalc:
"""1M image scenario: proxy scoring (0.6 hrs) preserves 90% compute savings vs full-model scoring."""
@@ -2232,7 +2232,7 @@ The following analysis formalizes the 10% heuristic as *the selection inequality
# │ Imports: mlsys.formatting (fmt)
# │ Exports: n_epochs_full_str, speedup_efficient_str, cost_total_iterative_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class SelectionInequalityMath:
"""Epoch-normalized selection inequality: one-shot (9× speedup) vs. iterative (slower than baseline)."""
@@ -2260,7 +2260,7 @@ class SelectionInequalityMath:
# ┌── 4. OUTPUT (Formatting) ─────────────────────────────────────────────
n_epochs_full_str = fmt(n_epochs_full, precision=0, commas=False)
subset_fraction_pct_str = fmt(subset_fraction * 100, precision=0, commas=False)
subset_fraction_pct_str = fmt_percent(subset_fraction, precision=0, commas=False)
cost_selection_full_str = fmt(cost_selection_full, precision=0, commas=False)
n_epochs_subset_str = fmt(n_epochs_subset, precision=0, commas=False)
cost_total_efficient_str = fmt(cost_total_efficient, precision=0, commas=False)
@@ -2424,7 +2424,7 @@ If $R > 1$ (data pipeline is the bottleneck), set echo factor $e \leq R$ to full
# │ Imports: mlsys.formatting (fmt)
# │ Exports: pipeline_throughput_str, pipeline_ratio_str, idle_pct_str, echo_hrs_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class DataEchoingRoi:
"""ImageNet heavy augmentation: echo factor 2 cuts training from 107 hrs to 53 hrs."""
@@ -2561,7 +2561,7 @@ For a concrete example, consider training a vision model:
# │ Imports: mlsys.formatting (fmt)
# │ Exports: c_raw_str, c_label_str, c_total_str, p_data_str, p_compute_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class CostBreakdown:
"""ImageNet-scale training cost breakdown: data costs (~81%) dominate compute (~19%)."""
@@ -2671,7 +2671,7 @@ ROI calculations assume that techniques deliver their promised benefits, but act
# │ Imports: mlsys.formatting (fmt)
# │ Exports: cost_random_total_str, cost_active_total_str, roi_pct_str, be_n_random_str, be_n_active_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class BreakevenCalc:
"""Active learning break-even: 2K labels + $500 inference achieves same accuracy as 5K random labels."""
@@ -2768,7 +2768,7 @@ $$
# │ Imports: mlsys.formatting (fmt)
# │ Exports: cost_build_str, savings_per_run_str, roi_1_str, roi_50_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class DeduplicationAmortization:
"""Deduplication pipeline ROI: negative at 1 run, highly profitable at 50 runs."""
@@ -3373,7 +3373,7 @@ Data selection involves counterintuitive diminishing returns that contradict the
# │ Imports: mlsys.formatting (fmt)
# │ Exports: Various formatted strings for inline use
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class FpScalingCalc:
"""Quantitative backing for all Fallacies and Pitfalls in the F&P section."""

View File

@@ -186,7 +186,7 @@ To see Amdahl's Law in action, consider how the parallel fraction $p$ differs dr
# │ Imports: mlsys.constants (H100_FLOPS_INT8), mlsys.formatting (fmt)
# │ Exports: amdahl_*_str, hw_speedup_str, h100_tflops_int8
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.constants import (
H100_FLOPS_INT8, TFLOPs, second,
BILLION, MILLION, TRILLION, THOUSAND
@@ -232,9 +232,9 @@ class AmdahlH100:
# ResNet
p_resnet_str = fmt(p_resnet, precision=2, commas=False)
p_resnet_pct_str = fmt(p_resnet*100, precision=0, commas=False)
p_resnet_pct_str = fmt_percent(p_resnet, precision=0, commas=False)
serial_resnet_str = fmt(1-p_resnet, precision=2, commas=False)
serial_resnet_pct_str = fmt((1-p_resnet)*100, precision=0, commas=False)
serial_resnet_pct_str = fmt_percent((1-p_resnet), precision=0, commas=False)
p_resnet_per_s_str = fmt(p_resnet / hw_speedup_factor, precision=4, commas=False)
amdahl_resnet_str = fmt(speedup_resnet, precision=1, commas=False)
amdahl_resnet_round_str = fmt(speedup_resnet, precision=0, commas=False)
@@ -242,7 +242,7 @@ class AmdahlH100:
# GPT-2
p_gpt2_str = fmt(p_gpt2, precision=2, commas=False)
serial_gpt2_str = fmt(1-p_gpt2, precision=2, commas=False)
serial_gpt2_pct_str = fmt((1-p_gpt2)*100, precision=0, commas=False)
serial_gpt2_pct_str = fmt_percent((1-p_gpt2), precision=0, commas=False)
p_gpt2_per_s_str = fmt(p_gpt2 / hw_speedup_factor, precision=4, commas=False)
amdahl_gpt2_str = fmt(speedup_gpt2, precision=1, commas=False)
amdahl_gpt2_ceil_str = fmt(ceiling_gpt2, precision=0, commas=False)
@@ -604,7 +604,7 @@ from mlsys.constants import (
MOBILE_NPU_TOPS_INT8, TFLOPs, second,
KIB_TO_BYTES, MIB_TO_BYTES
)
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
class CpuMlInefficiency:
"""CPU vs accelerator efficiency gap for ML workloads."""
@@ -1045,7 +1045,7 @@ output = relu(Z) # Vector: applies activation to
# │ Imports: mlsys.formatting (fmt)
# │ Exports: wm_in_str, wm_out_str, wm_params_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
class WeightMatrixCalc:
"""Weight matrix parameter count for a single linear layer."""
@@ -1586,7 +1586,7 @@ While tensor cores package matrix operations into structured computational units
# │ Imports: mlsys.constants (ENERGY_DRAM_ACCESS_PJ, SYSTOLIC_ARRAY_DIM)
# │ Exports: energy_ratio_str, vector_energy_str, systolic_energy_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import ENERGY_DRAM_ACCESS_PJ, SYSTOLIC_ARRAY_DIM
# ┌── LEGO ───────────────────────────────────────────────
@@ -1796,7 +1796,7 @@ node[right]{Data};
# │ Exports: layer_dim_str, array_dim_str, tile_count_str, reuse_factor_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import SYSTOLIC_ARRAY_DIM
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class TilingPrinciple:
@@ -1898,7 +1898,7 @@ The underlying principle remains consistent: data flows systematically through p
# │ Imports: mlsys.constants (SYSTOLIC_ARRAY_DIM), mlsys.formatting (fmt)
# │ Exports: systolic_dim_str, systolic_ops_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import SYSTOLIC_ARRAY_DIM
class SystolicOpsCalc:
@@ -2370,7 +2370,7 @@ To make these energy costs concrete, we can trace a single tensor through every
# │ dram_energy_pj_bit_str, latency_hbm_str, latency_l2_str,
# │ latency_l1_str, a100_tflops_fp16, reg_energy_pj_bit_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import (
BYTES_FP16, ENERGY_DRAM_PJ_PER_BYTE,
LATENCY_HBM3, LATENCY_L2_CACHE, LATENCY_L1_REGISTER,
@@ -2950,7 +2950,7 @@ from mlsys.constants import (
H100_MEM_BW, H100_FLOPS_FP16_TENSOR,
flop, byte, TB, second
)
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
# ┌── LEGO ───────────────────────────────────────────────
class RooflineGap:
@@ -3059,7 +3059,7 @@ To see how these intensity values translate into real performance predictions, a
# │ Imports: mlsys.constants (TRANSFORMER_*, BYTES_FP16)
# │ Exports: t_*_str, qkv_*_str, softmax_*_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import (
byte, MB, flop, GFLOPs, MFLOPs,
TRANSFORMER_HIDDEN_DIM_EXAMPLE, TRANSFORMER_SEQ_LEN_EXAMPLE,
@@ -3171,7 +3171,7 @@ A *convolutional layer analysis* demonstrates how these formulas apply in practi
# │ conv_input_mb_str, conv_weights_mb_str, conv_output_mb_str,
# │ conv_total_mb_str, conv_ai_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import BYTES_FP16, byte, MB, flop, GFLOPs
class Conv2dAnalysisCalc:
@@ -3266,7 +3266,7 @@ However, not all layers in a neural network exhibit this favorable profile. The
# │ dense_output_kb_str, dense_total_mb_str, dense_ai_str,
# │ dense_attainable_str, dense_util_pct_str, a100_bw
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import (
BYTES_FP16, byte, MB, KiB, flop, MFLOPs, GFLOPs, TFLOPs,
A100_MEM_BW, A100_FLOPS_FP16_TENSOR, GB, second,
@@ -3365,7 +3365,7 @@ The situation becomes even more extreme for element-wise operations like normali
# │ ln_params_kb_str, ln_output_mb_str, ln_total_mb_str,
# │ ln_ai_str, ln_attainable_str, a100_bw_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import byte, MB, A100_MEM_BW, GB, GFLOPs, TFLOPs, second
class LayernormAnalysisCalc:
@@ -3500,7 +3500,7 @@ For workloads where batching is impractical, such as interactive LLM generation
# │ Imports: mlsys.constants (GPT2_PARAMS, A100_*, BYTES_FP16)
# │ Exports: gpt2_*_str, a100_tflops_fp32
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.formulas import model_memory
from mlsys.constants import (
GB, GPT2_PARAMS, BYTES_FP16, flop, GFLOPs,
@@ -4004,7 +4004,7 @@ Building on software optimization techniques from @sec-model-compression and mem
# │ Imports: mlsys.constants (BYTES_FP32, byte, MB), mlsys.formatting (fmt)
# │ Exports: tensor_mb_str, total_mb_str, footprint_ratio_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import BYTES_FP32, byte, MB
class MemoryFootprintCalc:
@@ -4075,7 +4075,7 @@ Each operation produces an intermediate tensor that must be written to memory an
# │ Imports: mlsys.constants (BYTES_FP32), mlsys.formatting (fmt)
# │ Exports: tensor_mb_str, total_mb_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import BYTES_FP32
class MemoryFootprintTableCalc:
@@ -4474,7 +4474,7 @@ Machine learning compilers automate the translation of dataflow strategies into
# │ Imports: mlsys.formatting (fmt)
# │ Exports: naive_inference_ms, optimized_inference_ms, compiler_speedup_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
class CompilerSpeedupCalc:
"""ResNet-50 latency improvement from ML compiler graph and memory optimizations."""
@@ -4913,7 +4913,7 @@ Engineers assume specialized accelerators automatically outperform general-purpo
# │ Exports: fp_ridge_example_str, fp_layernorm_tflops_str,
# │ fp_layernorm_util_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import ENERGY_DRAM_ACCESS_PJ, ENERGY_SRAM_L1_PJ
class FpMemoryEnergyCalc:
@@ -4965,7 +4965,7 @@ Practitioners focus on peak TFLOPS without analyzing whether their workloads can
# │ mlsys.formatting (fmt)
# │ Exports: fp_nvlink_bw_str, fp_sync_time_str, fp_sync_overhead_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import NVLINK_A100_BW, GB, second
class FpMultigpuScalingCalc:
@@ -5017,7 +5017,7 @@ Vendors advertise peak FLOPS as the definitive measure of accelerator capability
# │ Exports: fp_ai_b1_str, fp_ai_b256_str, fp_t4_ridge_str,
# │ fp_t4_flops_str, fp_t4_bw_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import T4_FLOPS_FP16_TENSOR, T4_MEM_BW, TFLOPs, second, GB
class FpSmallBatchCalc:
@@ -5084,7 +5084,7 @@ Organizations optimize exclusively for specific vendors to maximize performance
# │ mlsys.constants (GB, BYTES_FP16)
# │ Exports: headroom_str, token_latency_ms_str, frame_budget_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.formulas import model_memory
from mlsys.constants import GB, BYTES_FP16
@@ -5186,7 +5186,7 @@ Beyond raw performance, we must evaluate hardware through the lens of *silicon s
# │ cpu_energy_day_str, npu_energy_day_str, carbon_intensity_str,
# │ co2_saved_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.constants import DAYS_PER_YEAR
class CarbonRoiCalc:

View File

@@ -91,7 +91,7 @@ from mlsys.constants import (
second,
BILLION, MILLION, TRILLION, THOUSAND
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class AIMomentStats:
@@ -207,7 +207,7 @@ from mlsys.constants import (
COLOR_DEPTH_8BIT,
IMAGENET_TEST_IMAGES,
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class VerificationGap:
@@ -561,7 +561,7 @@ This hybrid approach combined human-engineered features with statistical learnin
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Models
from mlsys.constants import MILLION
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class AlexNetBreakthrough:
@@ -1036,7 +1036,7 @@ node[below]{dense}(X1-|B1D.north west);
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware, Models
from mlsys.constants import Bparam, ZFLOPs, byte, GB
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class GPT3Scale:
@@ -1116,7 +1116,7 @@ The shift from expert systems to statistical learning to deep learning has drama
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Models
from mlsys.constants import MILLION
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class GPT4Scale:
@@ -1201,7 +1201,7 @@ Rather than beginning with an abstract definition, consider a system most people
# │ Exports: gmail_emails_t_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import GMAIL_EMAILS_PER_DAY, TRILLION
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class EmailScale:
@@ -1537,7 +1537,7 @@ from mlsys.constants import (
TRILLION, SEC_PER_DAY, flop
)
from mlsys.formulas import dTime
from mlsys.formatting import fmt, md_math
from mlsys.formatting import fmt_percent, fmt, md_math
# --- Inputs (cluster configuration) ---
num_gpus_value = 1024
@@ -1597,8 +1597,8 @@ class GPT3Training:
# ┌── 4. OUTPUT (Formatting) ──────────────────────────────────────────────
# Text strings
num_gpus_str = fmt(num_gpus, precision=0, commas=False)
eta_base_pct_str = fmt(eta_base * 100, precision=0, commas=False)
eta_opt_pct_str = fmt(eta_opt * 100, precision=0, commas=False)
eta_base_pct_str = fmt_percent(eta_base, precision=0, commas=False)
eta_opt_pct_str = fmt_percent(eta_opt, precision=0, commas=False)
days_initial_str = fmt(days_base, precision=0, commas=False)
days_optimized_str = fmt(days_opt, precision=0, commas=False)
days_saved_str = fmt(days_saved, precision=0, commas=False)
@@ -1726,7 +1726,7 @@ Each archetype manifests different constraints along the D·A·M axes, ensuring
# │ Exports: imagenet_images_m_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import IMAGENET_IMAGES, MILLION
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class ImageNetStats:
@@ -2173,7 +2173,7 @@ fill=OrangeL,draw=OrangeLine](DB2){Model\ Monitoring};
# │ Exports: scenario_* strings for mission and constraints.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Applications
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
# ┌── LEGO ───────────────────────────────────────────────
class ScenarioRegistry:
@@ -2296,7 +2296,7 @@ The interdependencies across the D·A·M axes create specific challenge categori
# │ Exports: waymo_data_low_str, waymo_data_high_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import WAYMO_DATA_PER_HOUR_LOW, WAYMO_DATA_PER_HOUR_HIGH, TB, hour
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class WaymoStats:
@@ -2438,7 +2438,7 @@ Engineers assume benchmark performance predicts production accuracy, but distrib
# │ total_ms, new_total_ms, improv_pct, naive_p
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formulas import calc_amdahls_speedup
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# --- Inputs (hypothetical pipeline timings) ---
t_inference_value = 45 # ms
@@ -2525,7 +2525,7 @@ Engineers optimize inference latency in isolation, but **Amdahl's Law** governs
# │ Imports: mlsys.formatting (fmt)
# │ Exports: acc_initial_str, acc_final_str, acc_drop_str, months_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class DriftFallacy:
@@ -2611,7 +2611,7 @@ This book makes a stronger claim: ML systems engineering is not merely a collect
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Systems
from mlsys.constants import GB, MB, KiB, watt, milliwatt, TFLOPs, second, flop
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class DeploymentSystems:

View File

@@ -88,7 +88,7 @@ from mlsys.constants import (
MOBILE_LATENCY_RANGE_MS, TINY_LATENCY_RANGE_MS,
MOBILE_RAM_RANGE_GB, MOBILE_STORAGE_RANGE, MOBILE_TDP_RANGE_W
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class MLSystemsSetup:
@@ -443,7 +443,7 @@ $$\text{Latency}_{\min} = \frac{2 \times \text{Distance}}{c_{\text{fiber}}} \app
# │ Exports: min_latency_str, distance_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import SPEED_OF_LIGHT_FIBER_KM_S, ureg
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class LightLatency:
@@ -503,7 +503,7 @@ Doubling clock frequency required approximately 8$\times$ more power. The breakd
# │ Imports: mlsys.formatting (fmt)
# │ Exports: compute_growth_str, mem_bw_growth_str, mem_wall_ratio_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class MemoryWall:
@@ -592,7 +592,7 @@ This principle dictates that if your system is **Memory Bound**\index{memory-bou
# │ Imports: mlsys.formatting (fmt)
# │ Exports: et_*_str variables for callout
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class EnergyTransmission:
@@ -733,7 +733,7 @@ from mlsys import Models
from mlsys.constants import (
RESNET50_FLOPs, GFLOPs, Mparam, Bparam, Kparam, byte, MB, GB, KB
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class LighthouseModels:
@@ -977,7 +977,7 @@ The following worked example demonstrates how to apply this analysis quantitativ
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import RESNET50_FLOPs, RESNET50_PARAMS, GFLOPs, Mparam, byte, MB
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class ResnetSetup:
@@ -1031,7 +1031,7 @@ from mlsys.constants import (
TFLOPs, second, TB, byte, flop,
)
from mlsys.formulas import calc_bottleneck
from mlsys.formatting import sci, fmt, sci_latex, md_frac
from mlsys.formatting import sci, fmt_percent, fmt, sci_latex, md_frac
# ┌── LEGO ───────────────────────────────────────────────
class ResnetCloud:
@@ -1108,7 +1108,7 @@ from mlsys.constants import (
TFLOPs, second, GB, byte, flop,
)
from mlsys.formulas import calc_bottleneck
from mlsys.formatting import sci_latex, md_frac, fmt
from mlsys.formatting import sci_latex, md_frac, fmt_percent, fmt
# ┌── LEGO ───────────────────────────────────────────────
class ResnetMobile:
@@ -1236,7 +1236,7 @@ from mlsys.constants import (
ESP32_RAM, ESP32_FLASH, ESP32_POWER_MIN, ESP32_POWER_MAX, ESP32_PRICE,
TB, GB, KiB, MB, watt, USD
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class HardwareSpectrumSetup:
@@ -1440,7 +1440,7 @@ above=1of $(B2.north east)!0.5!(B3.north west)$](B0){Cloud ML};
from mlsys.constants import SPEED_OF_LIGHT_FIBER_KM_S
from mlsys.formulas import calc_network_latency_ms
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class DistancePenalty:
@@ -1544,7 +1544,7 @@ from mlsys.constants import (
CLOUD_ELECTRICITY_PER_KWH, USD, GB, watt, ureg,
MILLION, MIB_TO_BYTES,
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class CloudEdgeTCO:
@@ -1739,7 +1739,7 @@ from mlsys.constants import (
BILLION, TRILLION, SEC_PER_HOUR, HOURS_PER_DAY,
BITS_PER_BYTE, KIB_TO_BYTES, MIB_TO_BYTES, MS_PER_SEC
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class VoiceAssistantWall:
@@ -1973,7 +1973,7 @@ The benefits of lower bandwidth usage and reduced latency become stark when we e
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware
from mlsys.formulas import calc_monthly_egress_cost
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.constants import (
VIDEO_1080P_WIDTH, VIDEO_1080P_HEIGHT, VIDEO_BYTES_PER_PIXEL_RGB,
VIDEO_FPS_STANDARD, CLOUD_EGRESS_PER_GB, MB, GB, second, MILLION,
@@ -2179,7 +2179,7 @@ To make these trade-offs concrete, the following worked example applies *edge in
from mlsys import Hardware, Models
from mlsys.constants import GFLOPs, CLOUD_ELECTRICITY_PER_KWH, HOURS_PER_YEAR, TFLOPs, USD, watt, ureg
from mlsys.formulas import calc_fleet_tco
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class EdgeSizing:
@@ -2481,7 +2481,7 @@ The battery life and resource constraints listed above translate directly into e
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware
from mlsys.constants import OBJECT_DETECTOR_POWER_W, ureg
from mlsys.formatting import md_frac, fmt
from mlsys.formatting import md_frac, fmt_percent, fmt
# ┌── LEGO ───────────────────────────────────────────────
class BatteryTax:
@@ -2549,7 +2549,7 @@ The battery constraint limits total energy consumption over time. However, even
# │ Exports: baseline_str, quant_power_str, quant_red_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class ThermalQuantCalc:
@@ -2675,7 +2675,7 @@ from mlsys.constants import (
BATTERY_CAPACITY_MAH, BATTERY_VOLTAGE_V, BATTERY_ENERGY_J,
ENERGY_MOBILENET_INF_MJ, ureg, BILLION
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class EnergyInference:
@@ -3503,7 +3503,7 @@ A related misconception holds that moving computation closer to the user always
# │ low_power_frac, high_power_frac
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check, md_frac
from mlsys.formatting import fmt_percent, fmt, check, md_frac
# ┌── LEGO ───────────────────────────────────────────────
class MobilePowerFallacyCalc:
@@ -3561,7 +3561,7 @@ The difference is qualitative, not just quantitative. As @sec-ml-systems-tinyml-
# │ edge_reliability_str, edge_total_str, tco_ratio_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class TcoPitfallCalc:
@@ -3620,7 +3620,7 @@ Teams optimize per-unit resource consumption while ignoring operational overhead
# │ Exports: cam_*_str variables for prose
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class AmdahlCameraCalc:
@@ -3649,8 +3649,8 @@ class AmdahlCameraCalc:
cam_ml_str = fmt(cam_ml_ms_value, precision=0, commas=False) # "60"
cam_post_str = fmt(cam_post_ms_value, precision=0, commas=False) # "40"
cam_total_str = fmt(cam_total_ms_value, precision=0, commas=False) # "200"
cam_ml_pct_str = fmt(cam_ml_frac_value * 100, precision=0, commas=False) # "30"
cam_non_ml_pct_str = fmt(cam_non_ml_frac_value * 100, precision=0, commas=False) # "70"
cam_ml_pct_str = fmt_percent(cam_ml_frac_value, precision=0, commas=False) # "30"
cam_non_ml_pct_str = fmt_percent(cam_non_ml_frac_value, precision=0, commas=False) # "70"
cam_speedup_10x_str = fmt(cam_speedup_10x_value, precision=2, commas=False) # "1.37"
cam_speedup_inf_str = fmt(cam_speedup_inf_value, precision=2, commas=False) # "1.43"
cam_ml_opt_str = fmt(cam_ml_optimized_ms_value, precision=0, commas=False) # "6"

View File

@@ -93,7 +93,7 @@ Inference computes a single forward pass: data flows through the network, a pred
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware, Models
from mlsys.constants import *
from mlsys.formatting import fmt, sci, md_math, check
from mlsys.formatting import fmt_percent, fmt, sci, md_math, check
from mlsys.formulas import model_memory
# ┌── LEGO ───────────────────────────────────────────────
@@ -700,7 +700,7 @@ To illustrate the scale of these operations concretely, consider the *attention
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware, Models
from mlsys.constants import TFLOPs, second, GPT2_HIDDEN_DIM, GPT2_LAYERS
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class GPT2Compute:
@@ -1033,7 +1033,7 @@ $$\begin{aligned}
# │ resnet50_param_mem_b32_mb_str, resnet50_act_mem_b64_gb_str,
# │ resnet50_grad_mem_b64_gb_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class ResNetBatchMemory:
@@ -1121,7 +1121,7 @@ v_t = \beta_2 v_{t-1} + (1-\beta_2)\big(\nabla \mathcal{L}(\theta_t)\big)^2
# │ Exports: adam_overhead_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import BYTES_FP32, byte, Mparam
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class AdamMemory:
@@ -1187,7 +1187,7 @@ The choice of optimization algorithm creates specific patterns of computation an
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware, Models
from mlsys.constants import BYTES_FP32, BYTES_FP16, GB, GiB
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.formulas import model_memory
# ┌── LEGO ───────────────────────────────────────────────
@@ -1438,7 +1438,7 @@ Here, we shift focus from *what* backpropagation computes to *what it costs* to
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware, Models
from mlsys.constants import BYTES_FP16, BYTES_ADAM_STATE, GB, MB, GiB, GPT2_HIDDEN_DIM, GPT2_LAYERS
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.formulas import model_memory
# ┌── LEGO ───────────────────────────────────────────────
@@ -1729,7 +1729,7 @@ plt.show()
# │ a100_bw_tbs_str, a100_ridge_str, h100_tflops_fp16_str,
# │ h100_bw_tbs_str, h100_ridge_str — all defined in training-setup.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class AttentionIntensity:
@@ -2071,7 +2071,7 @@ Applying this throughput analysis to our GPT-2 Lighthouse Model reveals where th
# │ parallel_tokenization_ms_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import PCIE_GEN3_BW, GB, second
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class GPT2DataPipeline:
@@ -2175,7 +2175,7 @@ While data pipeline throughput determines how fast training data reaches the GPU
# │ Exports: model_params_b_str, gradient_size_str, allreduce_str,
# │ network_time_str, network_bw_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.constants import BYTES_FP16, ALLREDUCE_FACTOR
# ┌── LEGO ───────────────────────────────────────────────
@@ -2343,7 +2343,7 @@ These hardware utilization patterns reinforce the batch-size--utilization relati
# │ vram_hidden_str, vram_layers_str, vram_activations_gb_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import BYTES_FP16, BYTES_FP32, BYTES_ADAM_STATE, byte
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class VRAMRequirements:
@@ -2435,7 +2435,7 @@ The total memory scales linearly with batch size (as established in @eq-activati
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware, Models
from mlsys.constants import BYTES_FP32, BYTES_FP16, GB, MB, Mparam, Bparam
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.formulas import model_memory
# ┌── LEGO ───────────────────────────────────────────────
@@ -2676,7 +2676,7 @@ from mlsys.constants import (
BILLION, TRILLION, MILLION, THOUSAND,
TFLOPs, TRILLION, SEC_PER_DAY, SEC_PER_YEAR_LEAP, HOURS_PER_DAY
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class LlamaTraining:
@@ -2742,7 +2742,7 @@ class LlamaTraining:
ub_params_b_str = fmt(params/BILLION, precision=0, commas=False)
ub_tokens_t_str = fmt(tokens/TRILLION, precision=0, commas=False)
ub_peak_tflops_str = fmt(peak_tflops, precision=0, commas=True)
ub_utilization_pct_str = fmt(utilization*100, precision=0, commas=False)
ub_utilization_pct_str = fmt_percent(utilization, precision=0, commas=False)
ub_effective_tflops_str = fmt(effective_tflops, precision=0, commas=False)
ub_num_gpus_str = f"{num_gpus:,}"
ub_rental_rate_str = fmt(rental_rate, precision=0, commas=False)
@@ -3279,7 +3279,7 @@ These benefits compound: a practitioner might simultaneously double batch size (
# │ ckpt_total_str, v100_capacity_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import GPT2_PARAMS, Mparam, Bparam, BYTES_FP32, BYTES_FP16, BYTES_ADAM_STATE, GB
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.formulas import model_memory
# ┌── LEGO ───────────────────────────────────────────────
@@ -3392,7 +3392,7 @@ model_1b_fp16_gb_str = MixedPrecisionMemory.model_1b_fp16_gb_str
# │ Imports: mlsys.formatting (fmt)
# │ Exports: v100_mp_speedup_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class MixedPrecisionSpeedup:
@@ -3606,7 +3606,7 @@ Optimal mixed-precision training requires matching the precision format to hardw
# │ Imports: mlsys.formatting (fmt)
# │ Exports: v100_fp16_speedup_str, a100_over_v100_str, h100_over_v100_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class CrossGenPrecisionCalc:
@@ -3675,7 +3675,7 @@ $$ \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right
# │ attn_matrix_mb_str, total_attn_gb_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import BYTES_FP32, MB, GB, byte
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class AttentionMemoryCalc:
@@ -3773,7 +3773,7 @@ Flash Attention achieves asymptotic improvements in both memory footprint and me
# │ Exports: fa_standard_mb_str, fa_flash_mb_str, fa_reduction_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import BYTES_FP32, MB
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class FlashAttentionCalc:
@@ -3891,7 +3891,7 @@ The benefits of Flash Attention become concrete when measured on real hardware.
# │ Imports: mlsys.formatting (fmt)
# │ Exports: flash_fwd_speedup_str, flash_bwd_speedup_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class FlashAttentionSpeedup:
@@ -4187,7 +4187,7 @@ Returning to our GPT-2 Lighthouse Model, *gradient accumulation* is essential fo
# │ accum_2wk_str, naive_2wk_str, comm_reduction_pct_str,
# │ accum_steps_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class GradientAccumulation:
@@ -4378,7 +4378,7 @@ To answer that question, let us walk through optimizing GPT-2 (1.5B parameters)
# │ amp_reduction_str, recompute_overhead_str, checkpoint_factor_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import GPT2_PARAMS, GPT2_LAYERS, GPT2_HIDDEN_DIM, V100_MEM_CAPACITY, GiB, BYTES_FP32, BYTES_FP16, GB, byte
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.formulas import model_memory
# ┌── LEGO ───────────────────────────────────────────────
@@ -4543,7 +4543,7 @@ The GPT-2 case study demonstrates how the optimization techniques examined in th
# │ Imports: mlsys.formatting (fmt)
# │ Exports: b_param_str..b_carbon_str, o_param_str..o_carbon_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class GPT2SummaryCalc:
@@ -4657,7 +4657,7 @@ o_carbon_str = GPT2SummaryCalc.o_carbon_str
# │ Imports: mlsys.formatting (fmt)
# │ Exports: mem_reduction_str, energy_reduction_pct_str, time_speedup_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class OptimizationSummaryCalc:
@@ -5062,7 +5062,7 @@ from mlsys.constants import (
A100_TDP, watt, GPUS_PER_HOST, BILLION, TRILLION,
SEC_PER_HOUR, HOURS_PER_DAY, THOUSAND
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class TrainingCarbonFootprint:
@@ -5218,7 +5218,7 @@ The journey from single-GPU optimization through multi-device parallelism reveal
# │ Exports: fp_model_20b_params_str..fp_prefetch_reduction_str (~30 vars)
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import *
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class FallaciesPitfallsSetup:
"""Quantitative values for all Fallacies and Pitfalls examples."""

View File

@@ -80,6 +80,13 @@ def fmt_plain(val, unit_str=""):
"""Format a plain scalar with an explicit unit string."""
return f"{val:g}", unit_str
# --- H100 Recap ---
h100_fp16_tflops_str = fmt(H100_FLOPS_FP16_TENSOR.m_as(TFLOPs/second), precision=0, commas=False)
h100_fp8_tflops_str = fmt(H100_FLOPS_FP8_TENSOR.m_as(TFLOPs/second), precision=0, commas=True)
h100_bw_tb_str = fmt(H100_MEM_BW.m_as(TB/second), precision=2, commas=False)
h100_cap_gb_str = fmt(H100_MEM_CAPACITY.m_as(GB), precision=0, commas=False)
h100_tdp_w_str = fmt(H100_TDP.m_as(watt), precision=0, commas=False)
# --- Cluster Scale References ---
CLUSTER_SMALL_GPUS_val, CLUSTER_SMALL_GPUS_unit = fmt_plain(constants.CLUSTER_SMALL_GPUS, "GPUs")
CLUSTER_MEDIUM_GPUS_val, CLUSTER_MEDIUM_GPUS_unit = fmt_plain(constants.CLUSTER_MEDIUM_GPUS, "GPUs")
@@ -212,11 +219,11 @@ The distributed systems reasoning in this book builds upon the single-machine pe
| **Tier** | **Specification** | **Reference Value** |
|:---------------|:------------------|:---------------------------------|
| **Compute** | FP16 Throughput | `{python} H100_FLOPS_FP16_TENSOR.m_as(TFLOPs/second):.0f` TFLOPS |
| **Compute** | FP8 Throughput | `{python} H100_FLOPS_FP8_TENSOR.m_as(TFLOPs/second):,.0f` TFLOPS |
| **Memory** | HBM3 Bandwidth | `{python} H100_MEM_BW.m_as(TB/second):.2f` TB/s |
| **Memory** | HBM3 Capacity | `{python} H100_MEM_CAPACITY.m_as(GB):.0f` GB |
| **Thermal** | TDP | `{python} H100_TDP.m_as(watt):.0f` W |
| **Compute** | FP16 Throughput | `{python} h100_fp16_tflops_str` TFLOPS |
| **Compute** | FP8 Throughput | `{python} h100_fp8_tflops_str` TFLOPS |
| **Memory** | HBM3 Bandwidth | `{python} h100_bw_tb_str` TB/s |
| **Memory** | HBM3 Capacity | `{python} h100_cap_gb_str` GB |
| **Thermal** | TDP | `{python} h100_tdp_w_str` W |
: **Single-Node Foundational Constants**. Recapping the hardware specifications for the H100 accelerator. These values provide the $R_{\text{peak}}$ and $BW$ baselines used in the Iron Law calculations throughout this volume. {#tbl-fleet-foundation-recap}

View File

@@ -52,7 +52,7 @@ from mlsys.constants import (
GPU_MTTF_HOURS, GPUS_PER_HOST,
INFINIBAND_NDR_BW_GBS, NVLINK_H100_BW
)
from mlsys.formatting import fmt, check, md, md_math
from mlsys.formatting import fmt_percent, fmt, check, md, md_math
from mlsys.formulas import calc_effective_flops
class C3Taxonomy:
@@ -120,50 +120,50 @@ class C3Taxonomy:
# Case 1
case1_n_gpus_str = fmt(case1_n_gpus, precision=0)
case1_mfu_pct_str = fmt(case1_mfu * 100, precision=0, commas=False)
case1_scaling_eff_pct_str = fmt(case1_scaling_eff * 100, precision=0, commas=False)
case1_target_mfu_pct_str = fmt(MFU_TRAINING_HIGH * 100, precision=0, commas=False)
case1_mfu_pct_str = fmt_percent(case1_mfu, precision=0, commas=False)
case1_scaling_eff_pct_str = fmt_percent(case1_scaling_eff, precision=0, commas=False)
case1_target_mfu_pct_str = fmt_percent(MFU_TRAINING_HIGH, precision=0, commas=False)
case1_throughput_ratio_str = fmt(case1_throughput_ratio, precision=2, commas=False)
case1_wasted_pct_str = fmt(case1_wasted_pct, precision=0, commas=False)
# Case 2
case2_n_gpus_str = fmt(case2_n_gpus, precision=0)
case2_mfu_pct_str = fmt(case2_mfu * 100, precision=0, commas=False)
case2_comm_pct_str = fmt(case2_comm_fraction * 100, precision=0, commas=False)
case2_compute_pct_str = fmt(case2_compute_fraction * 100, precision=0, commas=False)
case2_mfu_pct_str = fmt_percent(case2_mfu, precision=0, commas=False)
case2_comm_pct_str = fmt_percent(case2_comm_fraction, precision=0, commas=False)
case2_compute_pct_str = fmt_percent(case2_compute_fraction, precision=0, commas=False)
case2_speedup_str = fmt(case2_speedup_if_fixed, precision=1, commas=False)
# Case 3
case3_n_gpus_str = fmt(case3_n_gpus, precision=0)
case3_mfu_pct_str = fmt(case3_mfu * 100, precision=0, commas=False)
case3_comm_pct_str = fmt(case3_comm_fraction * 100, precision=0, commas=False)
case3_goodput_pct_str = fmt(case3_goodput_ratio * 100, precision=0, commas=False)
case3_coord_pct_str = fmt(case3_coord_fraction * 100, precision=0, commas=False)
case3_mfu_pct_str = fmt_percent(case3_mfu, precision=0, commas=False)
case3_comm_pct_str = fmt_percent(case3_comm_fraction, precision=0, commas=False)
case3_goodput_pct_str = fmt_percent(case3_goodput_ratio, precision=0, commas=False)
case3_coord_pct_str = fmt_percent(case3_coord_fraction, precision=0, commas=False)
# Effective FLOPS
peak_pflops_str = fmt(peak_pflops, precision=0)
effective_pflops_str = fmt(effective_pflops, precision=0)
eff_fraction_pct_str = fmt(eff_fraction * 100, precision=1, commas=False)
eff_fraction_pct_str = fmt_percent(eff_fraction, precision=1, commas=False)
c3_tax_str = fmt(c3_tax, precision=1, commas=False)
mfu_pct_str = fmt(MFU_TRAINING_HIGH * 100, precision=0, commas=False)
scaling_pct_str = fmt(SCALING_EFF_8192GPU * 100, precision=0, commas=False)
goodput_pct_str = fmt(goodput_all * 100, precision=0, commas=False)
mfu_pct_str = fmt_percent(MFU_TRAINING_HIGH, precision=0, commas=False)
scaling_pct_str = fmt_percent(SCALING_EFF_8192GPU, precision=0, commas=False)
goodput_pct_str = fmt_percent(goodput_all, precision=0, commas=False)
# Overhead constants
oh_pipeline_str = fmt(OVERHEAD_PIPELINE_BUBBLE * 100, precision=0, commas=False)
oh_checkpoint_str = fmt(OVERHEAD_CHECKPOINT * 100, precision=0, commas=False)
oh_failure_str = fmt(OVERHEAD_FAILURE_RECOVERY * 100, precision=0, commas=False)
oh_maintenance_str = fmt(OVERHEAD_MAINTENANCE * 100, precision=0, commas=False)
oh_pipeline_str = fmt_percent(OVERHEAD_PIPELINE_BUBBLE, precision=0, commas=False)
oh_checkpoint_str = fmt_percent(OVERHEAD_CHECKPOINT, precision=0, commas=False)
oh_failure_str = fmt_percent(OVERHEAD_FAILURE_RECOVERY, precision=0, commas=False)
oh_maintenance_str = fmt_percent(OVERHEAD_MAINTENANCE, precision=0, commas=False)
# Scaling efficiency constants
eff_32_str = fmt(SCALING_EFF_32GPU * 100, precision=0, commas=False)
eff_256_str = fmt(SCALING_EFF_256GPU * 100, precision=0, commas=False)
eff_1024_str = fmt(SCALING_EFF_1024GPU * 100, precision=0, commas=False)
eff_8192_str = fmt(SCALING_EFF_8192GPU * 100, precision=0, commas=False)
eff_32_str = fmt_percent(SCALING_EFF_32GPU, precision=0, commas=False)
eff_256_str = fmt_percent(SCALING_EFF_256GPU, precision=0, commas=False)
eff_1024_str = fmt_percent(SCALING_EFF_1024GPU, precision=0, commas=False)
eff_8192_str = fmt_percent(SCALING_EFF_8192GPU, precision=0, commas=False)
# MFU constants
mfu_low_str = fmt(MFU_TRAINING_LOW * 100, precision=0, commas=False)
mfu_high_str = fmt(MFU_TRAINING_HIGH * 100, precision=0, commas=False)
mfu_low_str = fmt_percent(MFU_TRAINING_LOW, precision=0, commas=False)
mfu_high_str = fmt_percent(MFU_TRAINING_HIGH, precision=0, commas=False)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
C3 = C3Taxonomy
@@ -496,7 +496,7 @@ The fleet spends exactly half its time on useful computation. Breaking down the
# └─────────────────────────────────────────────────────────────────────────────
eff_frac_val = MFU_TRAINING_HIGH * SCALING_EFF_1024GPU * 0.85
eff_frac_pct_str = fmt(eff_frac_val * 100, precision=1, commas=False)
eff_frac_pct_str = fmt_percent(eff_frac_val, precision=1, commas=False)
c3_tax_val = 1 / eff_frac_val
c3_tax_factor_str = fmt(c3_tax_val, precision=1, commas=False)
eff_frac_3dp_str = fmt(eff_frac_val, precision=3, commas=False)

View File

@@ -277,7 +277,7 @@ _eff_8192 = fmt(FF.eff_8192, precision=0, commas=False)
_mtbf_8k_min = fmt(FF.mtbf_8192_min, precision=0, commas=False)
_mtbf_100k_min = fmt(FF.mtbf_100k_min, precision=0, commas=False)
_ckpt_175 = fmt(FF.ckpt_175b_gb, precision=0, commas=False)
_goodput = fmt(FF.goodput_ratio * 100, precision=0, commas=False)
_goodput = fmt_percent(FF.goodput_ratio, precision=0, commas=False)
_mfu_lo = fmt(FF.mfu_low, precision=0, commas=False)
_mfu_hi = fmt(FF.mfu_high, precision=0, commas=False)
_rack_ai = fmt(FF.rack_ai, precision=0, commas=False)
@@ -501,7 +501,7 @@ At fleet scale, four categories of overhead consume wall-clock time that is not
| **Failure recovery** | ~`{python} FF.oh_failure`% | Faster detection, elastic rescheduling |
| **Maintenance windows** | ~`{python} FF.oh_maintenance`% | Rolling upgrades, live migration |
: **Overhead Budgets for Fleet-Scale Training**: These are fractions of wall-clock time. At 10,000+ GPUs, failure recovery dominates. The compound effect is multiplicative: total goodput ratio $\approx (1 - 0.05)(1 - 0.03)(1 - 0.10)(1 - 0.05) \approx$ `{python} fmt(FF.goodput_ratio * 100, precision=0)`%. {#tbl-fleet-overhead-budgets}
: **Overhead Budgets for Fleet-Scale Training**: These are fractions of wall-clock time. At 10,000+ GPUs, failure recovery dominates. The compound effect is multiplicative: total goodput ratio $\approx (1 - 0.05)(1 - 0.03)(1 - 0.10)(1 - 0.05) \approx$ `{python} fmt_percent(FF.goodput_ratio, precision=0)`%. {#tbl-fleet-overhead-budgets}
#### Power and Sustainability Numbers {.unnumbered}
@@ -772,9 +772,9 @@ peak_str = fmt(FF.peak_1024, precision=0)
eff_str = fmt(FF.eff_flops_1024, precision=0)
# Effective % of peak (0.50 × 0.50 × 0.77 ≈ 19.25%); use fmt_percent to avoid display bugs
eff_pct_str = fmt_percent(FF.eff_fraction, precision=1)
goodput_pct_str = fmt(FF.goodput_ratio * 100, precision=0, commas=False)
mfu_pct_str = fmt(MFU_TRAINING_HIGH * 100, precision=0, commas=False)
scaling_pct_str = fmt(SCALING_EFF_1024GPU * 100, precision=0, commas=False)
goodput_pct_str = fmt_percent(FF.goodput_ratio, precision=0, commas=False)
mfu_pct_str = fmt_percent(MFU_TRAINING_HIGH, precision=0, commas=False)
scaling_pct_str = fmt_percent(SCALING_EFF_1024GPU, precision=0, commas=False)
mfu_fmt = fmt(MFU_TRAINING_HIGH, precision=2, commas=False)
scaling_fmt = fmt(SCALING_EFF_1024GPU, precision=2, commas=False)
@@ -793,7 +793,7 @@ $$\text{Effective} = \text{Peak} \times \text{MFU} \times \eta_{\text{scaling}}
`{python} effective_eq_math`
The cluster delivers `{python} eff_pct_str`% of its peak FLOPS as useful training work. The remaining `{python} fmt(100 - FF.eff_fraction * 100, precision=0)`% is consumed by hardware underutilization (`{python} mfu_pct_str`% MFU), communication overhead (`{python} scaling_pct_str`% scaling efficiency), and operational losses (`{python} goodput_pct_str`% goodput ratio).
The cluster delivers `{python} eff_pct_str`% of its peak FLOPS as useful training work. The remaining `{python} fmt_percent(1 - FF.eff_fraction, precision=0)`% is consumed by hardware underutilization (`{python} mfu_pct_str`% MFU), communication overhead (`{python} scaling_pct_str`% scaling efficiency), and operational losses (`{python} goodput_pct_str`% goodput ratio).
This is not a failure of engineering---it is the physics of fleet-scale computation. Every additional GPU adds less marginal useful work, but the total throughput still far exceeds what a smaller cluster could achieve. The goal is not to reach 100% utilization; the goal is to deliver trained models faster than any smaller configuration could.

View File

@@ -23,7 +23,7 @@ from mlsys.constants import (
GPT3_PARAMS, GB, second, Mparam, THOUSAND,
SEC_PER_HOUR, SEC_PER_DAY, MILLION, TRILLION, BITS_PER_BYTE, TB
)
from mlsys.formatting import fmt, sci, check
from mlsys.formatting import fmt_percent, fmt, sci, check
start_chapter("vol2:distributed_training")
```
@@ -96,7 +96,7 @@ from mlsys.constants import (
GB, second, Mparam, Tparam, THOUSAND,
SEC_PER_HOUR, SEC_PER_DAY, MILLION, TRILLION, BITS_PER_BYTE, TB
)
from mlsys.formatting import fmt, sci, check
from mlsys.formatting import fmt_percent, fmt, sci, check
# ┌── LEGO ───────────────────────────────────────────────
```
## Why Distribution Is Necessary {#sec-distributed-training-systems-systems-multimachine-scaling-fundamentals-ff96}
@@ -681,7 +681,7 @@ Modern distributed training frameworks handle this distribution automatically th
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Models, Applications
from mlsys.constants import param, BILLION
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
class FrontierTrainingContext:
"""GPT-3 scale reference for distributed training."""
@@ -721,7 +721,7 @@ frontier_name = FrontierTrainingContext.frontier_name
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Models, Applications
from mlsys.constants import param, BILLION
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
class FrontierTrainingContext:
"""GPT-3 scale reference for distributed training."""
@@ -959,7 +959,7 @@ from mlsys.constants import (
NVLINK_H100_BW, INFINIBAND_HDR_BW, MILLION, BILLION, GB, byte, second,
GPUS_PER_HOST, BITS_PER_BYTE
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class Scaling8GPU:
@@ -2779,7 +2779,7 @@ class YoungDaly:
# ┌── 4. OUTPUT (Formatting) ──────────────────────────────────────────────
t_opt_min_str = fmt(t_opt_min_val, precision=0)
loss_pct_str = fmt(total_overhead * 100, precision=1)
loss_pct_str = fmt_percent(total_overhead, precision=1)
daily_savings_str = fmt(diff_daily, precision=0, commas=True)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────