fix: resolve Vol 2 PDF build failures and Pint unit display bugs

- Add missing attributes to FleetFoundations in appendix_fleet.qmd
- Fix regression_testing.png image path in fault_tolerance.qmd
- Add pgfplots package to header-includes.tex for TikZ compatibility
- Fortify fmt_percent in formatting.py to handle Pint Quantities properly, fixing the 19250000000000% display bug
2026-03-11 17:49:25 -05:00 · 2026-02-26 20:46:12 -05:00
parent baebb4c6d7
commit 96336ab0c6
4 changed files with 838 additions and 457 deletions
--- a/book/quarto/contents/vol2/backmatter/appendix_fleet.qmd
+++ b/book/quarto/contents/vol2/backmatter/appendix_fleet.qmd
@@ -31,7 +31,7 @@ This appendix collects the reference numbers and compact models for fleet-scale

 import math
 from mlsys.constants import *
-from mlsys.formatting import fmt, check, md, md_math, sci_latex
+from mlsys.formatting import fmt, fmt_percent, check, md, md_math, sci_latex
 from mlsys.formulas import (
    calc_mtbf_cluster, calc_mtbf_node,
    calc_failure_probability, calc_effective_flops,
@@ -62,6 +62,10 @@ class FleetFoundations:

    # Level 3: Cluster
    cluster_sizes = [CLUSTER_SMALL_GPUS, CLUSTER_MEDIUM_GPUS, CLUSTER_LARGE_GPUS, CLUSTER_MEGA_GPUS]
+    cl_small = CLUSTER_SMALL_GPUS
+    cl_medium = CLUSTER_MEDIUM_GPUS
+    cl_large = CLUSTER_LARGE_GPUS
+    cl_mega = CLUSTER_MEGA_GPUS  # for prose: "At FF.cl_mega GPUs"

    # Efficiency & Budgets
    mfu_range = (MFU_TRAINING_LOW, MFU_TRAINING_HIGH)
@@ -70,6 +74,12 @@ class FleetFoundations:
    oh_failure = OVERHEAD_FAILURE_RECOVERY
    oh_maintenance = OVERHEAD_MAINTENANCE

+    # Scaling efficiency percentages (for prose: ~FF.eff_1024%)
+    eff_32 = int(SCALING_EFF_32GPU * 100)
+    eff_256 = int(SCALING_EFF_256GPU * 100)
+    eff_1024 = int(SCALING_EFF_1024GPU * 100)
+    eff_8192 = int(SCALING_EFF_8192GPU * 100)
+
    # ┌── 2. EXECUTE (The Compute) ────────────────────────────────────────
    # Step 1: Level 4: Ratios & Rework
    nvlink_h100_bw_val = int(nvlink_h100_qty.m_as(GB / second))
@@ -81,10 +91,18 @@ class FleetFoundations:
        node_mtbf_params["psu_mttf"], 4
    )

+    mtbf_256_h = calc_mtbf_cluster(GPU_MTTF_HOURS, 256)
+    mtbf_2048_h = calc_mtbf_cluster(GPU_MTTF_HOURS, 2048)
    mtbf_8192_h = calc_mtbf_cluster(GPU_MTTF_HOURS, 8192)
    mtbf_100k_h = calc_mtbf_cluster(GPU_MTTF_HOURS, CLUSTER_MEGA_GPUS)
+    mtbf_256_min = mtbf_256_h.m_as(ureg.minute)
+    mtbf_2048_min = mtbf_2048_h.m_as(ureg.minute)
    mtbf_8192_min = mtbf_8192_h.m_as(ureg.minute)
    mtbf_100k_min = mtbf_100k_h.m_as(ureg.minute)
+    pfail_256_24h = calc_failure_probability(mtbf_256_h, 24 * ureg.hour)
+    pfail_2048_24h = calc_failure_probability(mtbf_2048_h, 24 * ureg.hour)
+    pfail_8192_24h = calc_failure_probability(mtbf_8192_h, 24 * ureg.hour)
+    pfail_100k_24h = calc_failure_probability(mtbf_100k_h, 24 * ureg.hour)

    # Step 2: Effective FLOPS (1024-GPU cluster)
    _peak_1024_qty = 1024 * h100_qty
@@ -95,7 +113,8 @@ class FleetFoundations:
        SCALING_EFF_1024GPU,
        goodput_ratio
    )
-    eff_fraction = _eff_flops_1024_qty / _peak_1024_qty
+    # Store as plain float so prose/formatters never see a Quantity (avoids display bugs)
+    eff_fraction = float((_eff_flops_1024_qty / _peak_1024_qty).m_as(''))

    # ┌── 3. GUARD (Invariants) ──────────────────────────────────────────
    check(nvlink_to_ib > 10, "NVLink must be >10x IB for hierarchy to hold")
@@ -115,6 +134,49 @@ class FleetFoundations:
    oh_failure_pct = int(oh_failure * 100)
    oh_maintenance_pct = int(oh_maintenance * 100)

+    # Interconnect scalars (for tables/prose)
+    nvlink_h100_bw = int(nvlink_h100_qty.m_as(GB / second))
+    pcie5_bw = int(pcie5_qty.m_as(GB / second))
+    ib_hdr_bw = INFINIBAND_HDR_BW_GBS
+    ib_xdr_bw = INFINIBAND_XDR_BW_GBS
+    roce_bw = ROCE_100G_BW_GBS
+    eth_400g_bw = ETHERNET_400G_BW_GBS
+    tpuv5_ici = int(TPUV5P_ICI_BW.m_as(GB / second))
+    ib_hdr_lat = IB_HDR_LATENCY_US
+    roce_lat = ROCE_LATENCY_US
+    tcp_lat = TCP_LATENCY_US
+    ib_to_tcp_lat = int(TCP_LATENCY_US / IB_NDR_LATENCY_US)
+
+    # Checkpoint sizes (GB or TB for display)
+    ckpt_7b_gb = calc_checkpoint_size(7e9, 16).m_as(GB)
+    ckpt_70b_gb = calc_checkpoint_size(70e9, 16).m_as(GB)
+    ckpt_175b_gb = calc_checkpoint_size(175e9, 16).m_as(GB)
+    ckpt_1t_tb = calc_checkpoint_size(1e12, 16).m_as(TB)
+
+    # Rack & PUE
+    rack_trad = RACK_POWER_TRADITIONAL_KW
+    rack_ai = RACK_POWER_AI_TYPICAL_KW
+    rack_ai_high = RACK_POWER_AI_HIGH_KW
+    air_limit = AIR_COOLING_LIMIT_KW
+    pue_liquid = PUE_LIQUID_COOLED
+    pue_air = PUE_BEST_AIR
+    pue_typical = PUE_TYPICAL
+    pue_legacy = PUE_LEGACY
+    rack_ratio = rack_ai / rack_trad
+
+    # Accelerator table (scalars for fmt)
+    h100_flops = int(H100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second))
+    h100_bw_tbs = fmt(H100_MEM_BW.m_as(TB / second), precision=2, commas=False)
+    h100_cap = int(H100_MEM_CAPACITY.m_as(GiB))
+    h100_tdp = int(H100_TDP.m_as(watt))
+    b200_flops = int(B200_FLOPS_FP16_TENSOR.m_as(TFLOPs / second))
+    b200_bw_tbs = fmt(B200_MEM_BW.m_as(TB / second), precision=2, commas=False)
+    b200_cap = int(B200_MEM_CAPACITY.m_as(GiB))
+    b200_tdp = int(B200_TDP.m_as(watt))
+    tpuv5_flops = int(TPUV5P_FLOPS_BF16.m_as(TFLOPs / second))
+    tpuv5_bw_tbs = fmt(TPUV5P_MEM_BW.m_as(TB / second), precision=2, commas=False)
+    tpuv5_cap = int(TPUV5P_MEM_CAPACITY.m_as(GiB))
+
 # ── EXPORTS (Bridge to Text) ────────────────────────────────────────────────
 FF = FleetFoundations

@@ -198,7 +260,7 @@ If you memorize nothing else from this section, memorize these:

 2. **MTBF scales as $1/N$**: A cluster's mean time between failures is the single-component MTBF divided by the number of components. At `{python} fmt(FF.cl_mega, precision=0)` GPUs, expect a failure every `{python} fmt(FF.mtbf_100k_min, precision=0)` minutes.

-3. **~`{python} fmt(FF.eff_fraction * 100, precision=0)`% effective utilization**: After MFU, scaling efficiency, and overhead losses compound, a 1,024-GPU cluster delivers roughly `{python} fmt(FF.eff_fraction * 100, precision=0)`% of its peak FLOPS as useful training work.
+3. **~`{python} fmt_percent(FF.eff_fraction)`% effective utilization**: After MFU, scaling efficiency, and overhead losses compound, a 1,024-GPU cluster delivers roughly `{python} fmt_percent(FF.eff_fraction)`% of its peak FLOPS as useful training work.
 :::

 Quick reference --- @tbl-fleet-numbers-quick-ref condenses the numbers below into one place. Use it for back-of-envelope checks; use the detailed tables in each subsection when designing or debugging.
@@ -708,7 +770,8 @@ The key insight for fleet-scale ML is that weak scaling is not just a mathematic

 peak_str = fmt(FF.peak_1024, precision=0)
 eff_str = fmt(FF.eff_flops_1024, precision=0)
-eff_pct_str = fmt(FF.eff_fraction * 100, precision=0, commas=False)
+# Effective % of peak (0.50 × 0.50 × 0.77 ≈ 19.25%); use fmt_percent to avoid display bugs
+eff_pct_str = fmt_percent(FF.eff_fraction, precision=1)
 goodput_pct_str = fmt(FF.goodput_ratio * 100, precision=0, commas=False)
 mfu_pct_str = fmt(MFU_TRAINING_HIGH * 100, precision=0, commas=False)
 scaling_pct_str = fmt(SCALING_EFF_1024GPU * 100, precision=0, commas=False)
--- a/book/quarto/contents/vol2/fault_tolerance/fault_tolerance.qmd
+++ b/book/quarto/contents/vol2/fault_tolerance/fault_tolerance.qmd
--- a/book/quarto/mlsys/formatting.py
+++ b/book/quarto/mlsys/formatting.py
@@ -58,6 +58,21 @@ def fmt(quantity, unit=None, precision=1, commas=True, allow_zero=False):
    return result


+def fmt_percent(ratio, precision=1, commas=False):
+    """
+    Format a ratio (0.0 to 1.0) as a percentage string for display.
+    Use this for compound fractions (e.g. effective utilization) to avoid
+    display bugs from a Quantity with uncancelled units or from wrong scaling.
+    Accepts a Pint Quantity (converted to dimensionless first) or a plain float.
+    """
+    if isinstance(ratio, ureg.Quantity):
+        # Crucial: convert to dimensionless first so units like flop/TFLOP cancel out!
+        ratio = float(ratio.m_as(''))
+    else:
+        ratio = float(ratio)
+    return fmt(ratio * 100, precision=precision, commas=commas)
+
+
 def sci(val, precision=2):
    """
    Formats a number or Pint Quantity into scientific notation using Unicode.
@@ -92,6 +107,11 @@ def display_percent(ratio, precision=0):
    """
    ratio: 0.0 to 1.0
    """
+    if isinstance(ratio, ureg.Quantity):
+        ratio = float(ratio.m_as(''))
+    else:
+        ratio = float(ratio)
+        
    pct = ratio * 100
    return {
        "value": ratio,
--- a/book/quarto/tex/header-includes.tex
+++ b/book/quarto/tex/header-includes.tex
@@ -80,6 +80,8 @@ labelformat=mylabel,justification=raggedright,singlelinecheck=false,font={ninept
 % Colors and visual elements
 \usepackage[dvipsnames]{xcolor}  % Extended color support
 \usepackage{tikz}           % Programmatic graphics
+\usepackage{pgfplots}       % Axis plots in TikZ (e.g. fault_tolerance bathtub curve)
+\pgfplotsset{compat=1.18}
 \usetikzlibrary{angles}
 \usetikzlibrary{arrows.meta}
 \usetikzlibrary{arrows}
@@ -440,6 +442,22 @@ aboveskip=0pt
 \definecolor{BlueDD}{RGB}{62,100,125}
 \colorlet{BlueDD}{magenta}

+% Diagram colors (used by inline TikZ in chapters; also in diagram.yml for SVG)
+\definecolor{BlueLine}{HTML}{006395}
+\definecolor{BlueL}{RGB}{209,243,255}
+\definecolor{GreenLine}{HTML}{008F45}
+\definecolor{GreenL}{RGB}{219,253,166}
+\definecolor{OrangeLine}{HTML}{E67817}
+\definecolor{OrangeL}{RGB}{250,212,175}
+\definecolor{RedLine}{HTML}{D9534F}
+\definecolor{RedL}{RGB}{253,226,240}
+\definecolor{GrayLine}{HTML}{666666}
+\definecolor{GrayL}{HTML}{E0E0E0}
+\definecolor{VioletLine}{HTML}{7E317B}
+\definecolor{VioletL}{RGB}{247,180,247}
+\definecolor{BrownLine}{RGB}{143,120,116}
+\definecolor{BrownL}{RGB}{233,222,220}
+
 % ===============================================================================
 % PART STYLING SYSTEM
 % ===============================================================================