refactor: use fmt_percent across Vol 1 and Vol 2 to prevent Pint precision bugs

This commit standardizes percentage formatting across the entire codebase to prevent critical rendering bugs (like the `19250000000000%` effective utilization bug in Vol 2).

Root Cause:
When dividing two Pint Quantities (e.g., `flop/second` by `TFLOPs/second`), Pint creates a mixed unit (`flop/TFLOPs`) rather than cancelling to a dimensionless value. The raw `.magnitude` of such a fraction is inflated by a factor of $10^{12}$ — the TFLOPs-to-flop conversion that was never applied. When the value was passed to `fmt(x * 100)`, the formatter then multiplied that already-massive magnitude by 100, resulting in an incorrect display.

Fix:
1. Fortified `fmt_percent` and `display_percent` in `mlsys/formatting.py` to defensively strip units using `.m_as('')`. This forces Pint to cancel out the units (e.g., a quantity of $10^{12}$ `flop/TFLOPs` collapses to the dimensionless value `1.0`) *before* the number is extracted.
2. Replaced all instances of `fmt(X * 100)` with the fortified `fmt_percent(X)` across Vol 1 and Vol 2.
3. Fixed inline f-strings in `appendix_assumptions.qmd` by moving formatting logic into the Python setup cell as `_str` variables, adhering to the book's standard practice.

Validation:
- Audited all `.magnitude` extractions in the codebase to ensure they are safe (e.g., explicitly converting to dimensionless units first).
- Ran `validate_inline_refs.py` and confirmed no Python variables are trapped inside LaTeX math mode.
- Successfully built full PDFs for both Volume 1 and Volume 2.
This commit is contained in:
Vijay Janapa Reddi
2026-02-26 20:54:12 -05:00
parent 96336ab0c6
commit 303cd26669
11 changed files with 172 additions and 165 deletions

View File

@@ -43,7 +43,7 @@ from mlsys.constants import (
BITS_PER_BYTE, SECONDS_PER_MINUTE, SEC_PER_HOUR, SEC_PER_DAY,
KIB_TO_BYTES, MILLION, DAYS_PER_MONTH
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
# Scenarios: Data Gravity, Serialization, Data Algebra, Long Tails
@@ -259,7 +259,7 @@ In systems, the mean is often misleading. Latency distributions are almost alway
```{python}
#| label: tail-latency-probability
#| echo: false
from mlsys.formatting import fmt, md_math
from mlsys.formatting import fmt_percent, fmt, md_math
# =============================================================================
# PURPOSE
@@ -283,7 +283,7 @@ p_slow_value = 1 - p_all_fast_value
# OUTPUT
# =============================================================================
p_all_fast_str = fmt(p_all_fast_value, precision=3, commas=False)
p_slow_pct_str = fmt(p_slow_value * 100, precision=1, commas=False)
p_slow_pct_str = fmt_percent(p_slow_value, precision=1, commas=False)
p_slow_eq = md_math(f"P(\\text{{Slow}}) = 1 - ({p_fast_value})^{{{n_requests_value}}} \\approx 1 - {p_all_fast_value:.3f} = {p_slow_pct_str}\\%")
```

View File

@@ -93,7 +93,7 @@ Every preceding chapter introduced decisions with measurable consequences: which
from mlsys import Hardware, Models
from mlsys.constants import *
from mlsys.formatting import fmt, sci
from mlsys.formatting import fmt_percent, fmt, sci
class BenchmarkingSetup:
"""Chapter-wide hardware and model constants for all benchmarking sections and callouts."""
@@ -557,7 +557,7 @@ Effective benchmark interpretation requires knowing the performance characterist
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import A100_MEM_BW, A100_FLOPS_FP16_TENSOR, TB, TFLOPs, second
from mlsys.constants import BILLION, MILLION
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class RooflineExamples:
@@ -611,7 +611,7 @@ class RooflineExamples:
bert_ai_b1_str = fmt(bert_ai_b1, precision=0, commas=False)
bert_perf_b1_str = fmt(bert_perf_b1, precision=0, commas=False)
bert_util_b1_str = fmt(bert_util_b1, precision=0, commas=False)
utilization_peak_pct_str = fmt(bert_util_peak * 100, precision=0, commas=False)
utilization_peak_pct_str = fmt_percent(bert_util_peak, precision=0, commas=False)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
a100_tflops_fp16_str = RooflineExamples.a100_tflops_fp16_str
@@ -670,7 +670,7 @@ from mlsys.constants import (
A100_FLOPS_FP16_TENSOR, TFLOPs, second, BILLION, MILLION,
Mparam, Bparam, BYTES_FP32, MB, TB
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class BertRoofline:
@@ -739,7 +739,7 @@ class BertRoofline:
batch32_str = str(batch_32)
utilization_peak_str = f"{util_peak}"
utilization_peak_pct_str = fmt(util_peak * 100, precision=0, commas=False)
utilization_peak_pct_str = fmt_percent(util_peak, precision=0, commas=False)
# Re-export A100 constants for this cell context
a100_tflops_fp16_str = fmt(peak_flops, precision=0, commas=False)
@@ -1907,7 +1907,7 @@ A key metric for evaluating parallelism is *scaling efficiency*, which quantifie
# │ Exports: ideal_str, eff_str, loss_str, eff_denom_str, t1_hours_str,
# │ n_gpus_str, tn_hours_str, scaling_eq_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class ScalingEfficiencyCalc:
"""Strong scaling efficiency for 8-GPU ResNet-50 training: 75% efficiency, 25% overhead loss."""
@@ -2014,7 +2014,7 @@ Training large-scale machine learning models requires substantial computational
# │ e_int8_total_str, s_load_str, s_compute_str, s_total_str,
# │ e_fp32_load_mj_str, e_fp32_compute_mj_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.constants import ENERGY_DRAM_PJ_PER_BYTE, ENERGY_FLOP_FP32_PJ, ENERGY_FLOP_INT8_PJ
class EnergyBreakdownCalc:
@@ -2360,7 +2360,7 @@ These component-level contributions explain why optimizing any single stage yiel
# │ bench_opt_total_str, bench_opt_inf_str, amdahl_ceiling_str,
# │ preprocess_pct_str, preprocess_fraction_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class AmdahlBenchmarkCalc:
"""Amdahl ceiling: 5× inference speedup → only 1.8× end-to-end when preprocessing dominates."""
@@ -2384,7 +2384,7 @@ class AmdahlBenchmarkCalc:
bench_opt_total_str = fmt(bench_opt_total_ms, precision=0, commas=False)
bench_opt_inf_str = fmt(bench_opt_inference_ms, precision=0, commas=False)
amdahl_ceiling_str = fmt(amdahl_ceiling, precision=2, commas=False)
preprocess_pct_str = fmt(preprocess_fraction * 100, precision=0, commas=False)
preprocess_pct_str = fmt_percent(preprocess_fraction, precision=0, commas=False)
preprocess_fraction_str = fmt(preprocess_fraction, precision=2, commas=False)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
@@ -2640,7 +2640,7 @@ These scenarios explain why the same hardware can report dramatically different
# │ inference_speedup_str, e2e_speedup_str, edgetpu_power_ratio_str,
# │ cpu_energy_mj_str, edgetpu_energy_mj_str, energy_ratio_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class EdgeTPUSpeedupCalc:
"""EdgeTPU vs Cortex-M7: 7.5× inference speedup, higher peak power, lower energy per inference."""
@@ -3803,7 +3803,7 @@ Acceptable degradation depends on deployment context. A 2% accuracy drop might b
# │ mv2_ece_int8_str, mv2_edge_fp32_str, mv2_edge_int8_str,
# │ mv2_edge_drop_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class MobileNetINT8Calc:
"""MobileNetV2 FP32 vs INT8: aggregate accuracy holds but calibration and edge cases degrade."""
@@ -3927,7 +3927,7 @@ Generation-specific metrics capture properties absent from discriminative benchm
# │ Exports: slow_str, fast_str, response_tokens_str, slow_toks_str,
# │ fast_toks_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class LLMThroughputCalc:
"""25 vs 100 tok/s on a 750-token response: 30 s vs 7.5 s — a 4× user-perceived difference."""

View File

@@ -81,7 +81,7 @@ For decades, the dominant strategy was straightforward: more data, better models
# │ Imports: mlsys.formatting (fmt)
# │ Exports: gpu_growth_str, gpu_period_str, web_data_growth_str, etc.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class SelectionEconomicsAnchor:
@@ -223,7 +223,7 @@ To make this concrete, consider training a model in the **GPT-2/Llama Lighthouse
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import Bparam, BILLION, TRILLION, SEC_PER_HOUR, MILLION, THOUSAND
from mlsys import Models
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class ComputeDataGap:
@@ -301,7 +301,7 @@ The systems framing reveals optimization opportunities invisible to the ML frami
# │ Imports: mlsys.formatting (fmt)
# │ Exports: training_cost_m_str, dataset_reduction_pct_str, combined_factor_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class IronLawSavings:
@@ -544,7 +544,7 @@ To make the Information-Compute Ratio concrete, consider how coreset selection i
# │ Exports: imagenet_size_str, icr_ratio_str, coreset_pct_str, etc.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import RESNET50_FLOPs, GFLOPs, IMAGENET_IMAGES
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class IcrCoresetComparison:
"""Compare learning-per-FLOP for random sampling vs. coreset selection."""
@@ -586,7 +586,7 @@ class IcrCoresetComparison:
acc_gain_random_str = fmt(acc_gain_random_value, precision=1, commas=False)
acc_gain_coreset_str = fmt(acc_gain_coreset_value, precision=1, commas=False)
acc_diff_str = fmt(acc_diff_value, precision=1, commas=False)
coreset_pct_str = fmt(coreset_fraction_value * 100, precision=0, commas=False)
coreset_pct_str = fmt_percent(coreset_fraction_value, precision=0, commas=False)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
resnet50_fwd_gflops_str = IcrCoresetComparison.resnet50_fwd_gflops_str
@@ -671,7 +671,7 @@ Why does this heterogeneity exist? The answer lies in how neural networks learn
# │ Imports: mlsys.formatting (fmt)
# │ Exports: epsilon_str, epsilon_pct_str, n_clean_str, n_noisy_str, ratio_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class QualityMultiplier:
@@ -697,7 +697,7 @@ class QualityMultiplier:
# ┌── 4. OUTPUT (Formatting) ──────────────────────────────────────────────
epsilon_str = fmt(epsilon, precision=2, commas=False)
epsilon_pct_str = fmt(epsilon * 100, precision=0, commas=False)
epsilon_pct_str = fmt_percent(epsilon, precision=0, commas=False)
n_clean_str = fmt(n_clean, precision=0, commas=False)
n_noisy_str = fmt(n_noisy, precision=0, commas=True)
ratio_str = fmt(ratio, precision=0, commas=False)
@@ -875,7 +875,7 @@ Given these trade-offs, most practitioners find that EL2N with a small proxy mod
# │ Imports: mlsys.formatting (fmt)
# │ Exports: n_train_images_str, coreset_fraction_pct_str, n_coreset_str, etc.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class CoresetPractice:
"""Practical 10× coreset workflow: 5-epoch proxy selects 100K from 1M images."""
@@ -894,7 +894,7 @@ class CoresetPractice:
# ┌── 4. OUTPUT (Formatting) ─────────────────────────────────────────────
n_train_images_str = fmt(n_train_images_value / MILLION, precision=0) + " million"
coreset_fraction_pct_str = fmt(coreset_fraction_value * 100, precision=0, commas=False)
coreset_fraction_pct_str = fmt_percent(coreset_fraction_value, precision=0, commas=False)
n_coreset_str = fmt(n_coreset_value, precision=0, commas=True)
n_epochs_proxy_str = fmt(n_epochs_proxy_value, precision=0, commas=False)
@@ -1061,7 +1061,7 @@ From a systems perspective, curriculum learning improves convergence by reducing
# │ Imports: mlsys.formatting (fmt)
# │ Exports: cifar10_speedup_str, imagenet_speedup_str, mentornet_speedup_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class CurriculumBenchmarks:
"""Curriculum learning convergence speedups across CIFAR-10, CIFAR-100, ImageNet, MentorNet."""
@@ -1214,7 +1214,7 @@ The economic implications are substantial. In production settings, labeling cost
# │ Imports: mlsys.formatting (fmt)
# │ Exports: n_unlabeled_str, cost_saving_str, speedup_str, etc.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class ActiveLearningRoi:
"""Medical imaging active learning: 20× speedup, $4.75M savings vs. naive labeling."""
@@ -1385,7 +1385,7 @@ The systems trade-off in semi-supervised learning is straightforward: it typical
# │ Imports: mlsys.formatting (fmt)
# │ Exports: cifar10_fixmatch_*_str, acc_loss_str, cost_reduction_str, etc.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class FixmatchLabelEfficiency:
"""FixMatch CIFAR-10: 200× label reduction for ~8× total cost savings."""
@@ -1551,7 +1551,7 @@ To illustrate this economic transformation, consider a company building ten spec
# │ Imports: mlsys.formatting (fmt)
# │ Exports: label_cost_drop_str, marginal_compute_reduction_str, etc.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class FoundationCostAmortization:
"""Foundation model amortization: 10 tasks, 100× label reduction, 20× marginal compute drop."""
@@ -2105,7 +2105,7 @@ Here $T_{selection}$ is the time spent scoring the pool and $T_{train}$ is the c
# │ Imports: mlsys.formatting (fmt)
# │ Exports: score_a_str, savings_b_pct_str, trap_pct_str, etc.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class SelectionInequalityCalc:
"""1M image scenario: proxy scoring (0.6 hrs) preserves 90% compute savings vs full-model scoring."""
@@ -2232,7 +2232,7 @@ The following analysis formalizes the 10% heuristic as *the selection inequality
# │ Imports: mlsys.formatting (fmt)
# │ Exports: n_epochs_full_str, speedup_efficient_str, cost_total_iterative_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class SelectionInequalityMath:
"""Epoch-normalized selection inequality: one-shot (9× speedup) vs. iterative (slower than baseline)."""
@@ -2260,7 +2260,7 @@ class SelectionInequalityMath:
# ┌── 4. OUTPUT (Formatting) ─────────────────────────────────────────────
n_epochs_full_str = fmt(n_epochs_full, precision=0, commas=False)
subset_fraction_pct_str = fmt(subset_fraction * 100, precision=0, commas=False)
subset_fraction_pct_str = fmt_percent(subset_fraction, precision=0, commas=False)
cost_selection_full_str = fmt(cost_selection_full, precision=0, commas=False)
n_epochs_subset_str = fmt(n_epochs_subset, precision=0, commas=False)
cost_total_efficient_str = fmt(cost_total_efficient, precision=0, commas=False)
@@ -2424,7 +2424,7 @@ If $R > 1$ (data pipeline is the bottleneck), set echo factor $e \leq R$ to full
# │ Imports: mlsys.formatting (fmt)
# │ Exports: pipeline_throughput_str, pipeline_ratio_str, idle_pct_str, echo_hrs_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class DataEchoingRoi:
"""ImageNet heavy augmentation: echo factor 2 cuts training from 107 hrs to 53 hrs."""
@@ -2561,7 +2561,7 @@ For a concrete example, consider training a vision model:
# │ Imports: mlsys.formatting (fmt)
# │ Exports: c_raw_str, c_label_str, c_total_str, p_data_str, p_compute_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class CostBreakdown:
"""ImageNet-scale training cost breakdown: data costs (~81%) dominate compute (~19%)."""
@@ -2671,7 +2671,7 @@ ROI calculations assume that techniques deliver their promised benefits, but act
# │ Imports: mlsys.formatting (fmt)
# │ Exports: cost_random_total_str, cost_active_total_str, roi_pct_str, be_n_random_str, be_n_active_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class BreakevenCalc:
"""Active learning break-even: 2K labels + $500 inference achieves same accuracy as 5K random labels."""
@@ -2768,7 +2768,7 @@ $$
# │ Imports: mlsys.formatting (fmt)
# │ Exports: cost_build_str, savings_per_run_str, roi_1_str, roi_50_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class DeduplicationAmortization:
"""Deduplication pipeline ROI: negative at 1 run, highly profitable at 50 runs."""
@@ -3373,7 +3373,7 @@ Data selection involves counterintuitive diminishing returns that contradict the
# │ Imports: mlsys.formatting (fmt)
# │ Exports: Various formatted strings for inline use
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class FpScalingCalc:
"""Quantitative backing for all Fallacies and Pitfalls in the F&P section."""

View File

@@ -186,7 +186,7 @@ To see Amdahl's Law in action, consider how the parallel fraction $p$ differs dr
# │ Imports: mlsys.constants (H100_FLOPS_INT8), mlsys.formatting (fmt)
# │ Exports: amdahl_*_str, hw_speedup_str, h100_tflops_int8
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.constants import (
H100_FLOPS_INT8, TFLOPs, second,
BILLION, MILLION, TRILLION, THOUSAND
@@ -232,9 +232,9 @@ class AmdahlH100:
# ResNet
p_resnet_str = fmt(p_resnet, precision=2, commas=False)
p_resnet_pct_str = fmt(p_resnet*100, precision=0, commas=False)
p_resnet_pct_str = fmt_percent(p_resnet, precision=0, commas=False)
serial_resnet_str = fmt(1-p_resnet, precision=2, commas=False)
serial_resnet_pct_str = fmt((1-p_resnet)*100, precision=0, commas=False)
serial_resnet_pct_str = fmt_percent((1-p_resnet), precision=0, commas=False)
p_resnet_per_s_str = fmt(p_resnet / hw_speedup_factor, precision=4, commas=False)
amdahl_resnet_str = fmt(speedup_resnet, precision=1, commas=False)
amdahl_resnet_round_str = fmt(speedup_resnet, precision=0, commas=False)
@@ -242,7 +242,7 @@ class AmdahlH100:
# GPT-2
p_gpt2_str = fmt(p_gpt2, precision=2, commas=False)
serial_gpt2_str = fmt(1-p_gpt2, precision=2, commas=False)
serial_gpt2_pct_str = fmt((1-p_gpt2)*100, precision=0, commas=False)
serial_gpt2_pct_str = fmt_percent((1-p_gpt2), precision=0, commas=False)
p_gpt2_per_s_str = fmt(p_gpt2 / hw_speedup_factor, precision=4, commas=False)
amdahl_gpt2_str = fmt(speedup_gpt2, precision=1, commas=False)
amdahl_gpt2_ceil_str = fmt(ceiling_gpt2, precision=0, commas=False)
@@ -604,7 +604,7 @@ from mlsys.constants import (
MOBILE_NPU_TOPS_INT8, TFLOPs, second,
KIB_TO_BYTES, MIB_TO_BYTES
)
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
class CpuMlInefficiency:
"""CPU vs accelerator efficiency gap for ML workloads."""
@@ -1045,7 +1045,7 @@ output = relu(Z) # Vector: applies activation to
# │ Imports: mlsys.formatting (fmt)
# │ Exports: wm_in_str, wm_out_str, wm_params_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
class WeightMatrixCalc:
"""Weight matrix parameter count for a single linear layer."""
@@ -1586,7 +1586,7 @@ While tensor cores package matrix operations into structured computational units
# │ Imports: mlsys.constants (ENERGY_DRAM_ACCESS_PJ, SYSTOLIC_ARRAY_DIM)
# │ Exports: energy_ratio_str, vector_energy_str, systolic_energy_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import ENERGY_DRAM_ACCESS_PJ, SYSTOLIC_ARRAY_DIM
# ┌── LEGO ───────────────────────────────────────────────
@@ -1796,7 +1796,7 @@ node[right]{Data};
# │ Exports: layer_dim_str, array_dim_str, tile_count_str, reuse_factor_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import SYSTOLIC_ARRAY_DIM
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class TilingPrinciple:
@@ -1898,7 +1898,7 @@ The underlying principle remains consistent: data flows systematically through p
# │ Imports: mlsys.constants (SYSTOLIC_ARRAY_DIM), mlsys.formatting (fmt)
# │ Exports: systolic_dim_str, systolic_ops_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import SYSTOLIC_ARRAY_DIM
class SystolicOpsCalc:
@@ -2370,7 +2370,7 @@ To make these energy costs concrete, we can trace a single tensor through every
# │ dram_energy_pj_bit_str, latency_hbm_str, latency_l2_str,
# │ latency_l1_str, a100_tflops_fp16, reg_energy_pj_bit_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import (
BYTES_FP16, ENERGY_DRAM_PJ_PER_BYTE,
LATENCY_HBM3, LATENCY_L2_CACHE, LATENCY_L1_REGISTER,
@@ -2950,7 +2950,7 @@ from mlsys.constants import (
H100_MEM_BW, H100_FLOPS_FP16_TENSOR,
flop, byte, TB, second
)
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
# ┌── LEGO ───────────────────────────────────────────────
class RooflineGap:
@@ -3059,7 +3059,7 @@ To see how these intensity values translate into real performance predictions, a
# │ Imports: mlsys.constants (TRANSFORMER_*, BYTES_FP16)
# │ Exports: t_*_str, qkv_*_str, softmax_*_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import (
byte, MB, flop, GFLOPs, MFLOPs,
TRANSFORMER_HIDDEN_DIM_EXAMPLE, TRANSFORMER_SEQ_LEN_EXAMPLE,
@@ -3171,7 +3171,7 @@ A *convolutional layer analysis* demonstrates how these formulas apply in practi
# │ conv_input_mb_str, conv_weights_mb_str, conv_output_mb_str,
# │ conv_total_mb_str, conv_ai_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import BYTES_FP16, byte, MB, flop, GFLOPs
class Conv2dAnalysisCalc:
@@ -3266,7 +3266,7 @@ However, not all layers in a neural network exhibit this favorable profile. The
# │ dense_output_kb_str, dense_total_mb_str, dense_ai_str,
# │ dense_attainable_str, dense_util_pct_str, a100_bw
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import (
BYTES_FP16, byte, MB, KiB, flop, MFLOPs, GFLOPs, TFLOPs,
A100_MEM_BW, A100_FLOPS_FP16_TENSOR, GB, second,
@@ -3365,7 +3365,7 @@ The situation becomes even more extreme for element-wise operations like normali
# │ ln_params_kb_str, ln_output_mb_str, ln_total_mb_str,
# │ ln_ai_str, ln_attainable_str, a100_bw_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import byte, MB, A100_MEM_BW, GB, GFLOPs, TFLOPs, second
class LayernormAnalysisCalc:
@@ -3500,7 +3500,7 @@ For workloads where batching is impractical, such as interactive LLM generation
# │ Imports: mlsys.constants (GPT2_PARAMS, A100_*, BYTES_FP16)
# │ Exports: gpt2_*_str, a100_tflops_fp32
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.formulas import model_memory
from mlsys.constants import (
GB, GPT2_PARAMS, BYTES_FP16, flop, GFLOPs,
@@ -4004,7 +4004,7 @@ Building on software optimization techniques from @sec-model-compression and mem
# │ Imports: mlsys.constants (BYTES_FP32, byte, MB), mlsys.formatting (fmt)
# │ Exports: tensor_mb_str, total_mb_str, footprint_ratio_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import BYTES_FP32, byte, MB
class MemoryFootprintCalc:
@@ -4075,7 +4075,7 @@ Each operation produces an intermediate tensor that must be written to memory an
# │ Imports: mlsys.constants (BYTES_FP32), mlsys.formatting (fmt)
# │ Exports: tensor_mb_str, total_mb_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import BYTES_FP32
class MemoryFootprintTableCalc:
@@ -4474,7 +4474,7 @@ Machine learning compilers automate the translation of dataflow strategies into
# │ Imports: mlsys.formatting (fmt)
# │ Exports: naive_inference_ms, optimized_inference_ms, compiler_speedup_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
class CompilerSpeedupCalc:
"""ResNet-50 latency improvement from ML compiler graph and memory optimizations."""
@@ -4913,7 +4913,7 @@ Engineers assume specialized accelerators automatically outperform general-purpo
# │ Exports: fp_ridge_example_str, fp_layernorm_tflops_str,
# │ fp_layernorm_util_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import ENERGY_DRAM_ACCESS_PJ, ENERGY_SRAM_L1_PJ
class FpMemoryEnergyCalc:
@@ -4965,7 +4965,7 @@ Practitioners focus on peak TFLOPS without analyzing whether their workloads can
# │ mlsys.formatting (fmt)
# │ Exports: fp_nvlink_bw_str, fp_sync_time_str, fp_sync_overhead_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import NVLINK_A100_BW, GB, second
class FpMultigpuScalingCalc:
@@ -5017,7 +5017,7 @@ Vendors advertise peak FLOPS as the definitive measure of accelerator capability
# │ Exports: fp_ai_b1_str, fp_ai_b256_str, fp_t4_ridge_str,
# │ fp_t4_flops_str, fp_t4_bw_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
from mlsys.constants import T4_FLOPS_FP16_TENSOR, T4_MEM_BW, TFLOPs, second, GB
class FpSmallBatchCalc:
@@ -5084,7 +5084,7 @@ Organizations optimize exclusively for specific vendors to maximize performance
# │ mlsys.constants (GB, BYTES_FP16)
# │ Exports: headroom_str, token_latency_ms_str, frame_budget_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.formulas import model_memory
from mlsys.constants import GB, BYTES_FP16
@@ -5186,7 +5186,7 @@ Beyond raw performance, we must evaluate hardware through the lens of *silicon s
# │ cpu_energy_day_str, npu_energy_day_str, carbon_intensity_str,
# │ co2_saved_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.constants import DAYS_PER_YEAR
class CarbonRoiCalc:

View File

@@ -91,7 +91,7 @@ from mlsys.constants import (
second,
BILLION, MILLION, TRILLION, THOUSAND
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class AIMomentStats:
@@ -207,7 +207,7 @@ from mlsys.constants import (
COLOR_DEPTH_8BIT,
IMAGENET_TEST_IMAGES,
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class VerificationGap:
@@ -561,7 +561,7 @@ This hybrid approach combined human-engineered features with statistical learnin
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Models
from mlsys.constants import MILLION
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class AlexNetBreakthrough:
@@ -1036,7 +1036,7 @@ node[below]{dense}(X1-|B1D.north west);
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware, Models
from mlsys.constants import Bparam, ZFLOPs, byte, GB
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class GPT3Scale:
@@ -1116,7 +1116,7 @@ The shift from expert systems to statistical learning to deep learning has drama
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Models
from mlsys.constants import MILLION
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class GPT4Scale:
@@ -1201,7 +1201,7 @@ Rather than beginning with an abstract definition, consider a system most people
# │ Exports: gmail_emails_t_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import GMAIL_EMAILS_PER_DAY, TRILLION
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class EmailScale:
@@ -1537,7 +1537,7 @@ from mlsys.constants import (
TRILLION, SEC_PER_DAY, flop
)
from mlsys.formulas import dTime
from mlsys.formatting import fmt, md_math
from mlsys.formatting import fmt_percent, fmt, md_math
# --- Inputs (cluster configuration) ---
num_gpus_value = 1024
@@ -1597,8 +1597,8 @@ class GPT3Training:
# ┌── 4. OUTPUT (Formatting) ──────────────────────────────────────────────
# Text strings
num_gpus_str = fmt(num_gpus, precision=0, commas=False)
eta_base_pct_str = fmt(eta_base * 100, precision=0, commas=False)
eta_opt_pct_str = fmt(eta_opt * 100, precision=0, commas=False)
eta_base_pct_str = fmt_percent(eta_base, precision=0, commas=False)
eta_opt_pct_str = fmt_percent(eta_opt, precision=0, commas=False)
days_initial_str = fmt(days_base, precision=0, commas=False)
days_optimized_str = fmt(days_opt, precision=0, commas=False)
days_saved_str = fmt(days_saved, precision=0, commas=False)
@@ -1726,7 +1726,7 @@ Each archetype manifests different constraints along the D·A·M axes, ensuring
# │ Exports: imagenet_images_m_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import IMAGENET_IMAGES, MILLION
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class ImageNetStats:
@@ -2173,7 +2173,7 @@ fill=OrangeL,draw=OrangeLine](DB2){Model\ Monitoring};
# │ Exports: scenario_* strings for mission and constraints.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Applications
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
# ┌── LEGO ───────────────────────────────────────────────
class ScenarioRegistry:
@@ -2296,7 +2296,7 @@ The interdependencies across the D·A·M axes create specific challenge categori
# │ Exports: waymo_data_low_str, waymo_data_high_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import WAYMO_DATA_PER_HOUR_LOW, WAYMO_DATA_PER_HOUR_HIGH, TB, hour
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class WaymoStats:
@@ -2438,7 +2438,7 @@ Engineers assume benchmark performance predicts production accuracy, but distrib
# │ total_ms, new_total_ms, improv_pct, naive_p
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formulas import calc_amdahls_speedup
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# --- Inputs (hypothetical pipeline timings) ---
t_inference_value = 45 # ms
@@ -2525,7 +2525,7 @@ Engineers optimize inference latency in isolation, but **Amdahl's Law** governs
# │ Imports: mlsys.formatting (fmt)
# │ Exports: acc_initial_str, acc_final_str, acc_drop_str, months_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class DriftFallacy:
@@ -2611,7 +2611,7 @@ This book makes a stronger claim: ML systems engineering is not merely a collect
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Systems
from mlsys.constants import GB, MB, KiB, watt, milliwatt, TFLOPs, second, flop
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class DeploymentSystems:

View File

@@ -88,7 +88,7 @@ from mlsys.constants import (
MOBILE_LATENCY_RANGE_MS, TINY_LATENCY_RANGE_MS,
MOBILE_RAM_RANGE_GB, MOBILE_STORAGE_RANGE, MOBILE_TDP_RANGE_W
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class MLSystemsSetup:
@@ -443,7 +443,7 @@ $$\text{Latency}_{\min} = \frac{2 \times \text{Distance}}{c_{\text{fiber}}} \app
# │ Exports: min_latency_str, distance_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import SPEED_OF_LIGHT_FIBER_KM_S, ureg
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class LightLatency:
@@ -503,7 +503,7 @@ Doubling clock frequency required approximately 8$\times$ more power. The breakd
# │ Imports: mlsys.formatting (fmt)
# │ Exports: compute_growth_str, mem_bw_growth_str, mem_wall_ratio_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class MemoryWall:
@@ -592,7 +592,7 @@ This principle dictates that if your system is **Memory Bound**\index{memory-bou
# │ Imports: mlsys.formatting (fmt)
# │ Exports: et_*_str variables for callout
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class EnergyTransmission:
@@ -733,7 +733,7 @@ from mlsys import Models
from mlsys.constants import (
RESNET50_FLOPs, GFLOPs, Mparam, Bparam, Kparam, byte, MB, GB, KB
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class LighthouseModels:
@@ -977,7 +977,7 @@ The following worked example demonstrates how to apply this analysis quantitativ
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import RESNET50_FLOPs, RESNET50_PARAMS, GFLOPs, Mparam, byte, MB
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class ResnetSetup:
@@ -1031,7 +1031,7 @@ from mlsys.constants import (
TFLOPs, second, TB, byte, flop,
)
from mlsys.formulas import calc_bottleneck
from mlsys.formatting import sci, fmt, sci_latex, md_frac
from mlsys.formatting import sci, fmt_percent, fmt, sci_latex, md_frac
# ┌── LEGO ───────────────────────────────────────────────
class ResnetCloud:
@@ -1108,7 +1108,7 @@ from mlsys.constants import (
TFLOPs, second, GB, byte, flop,
)
from mlsys.formulas import calc_bottleneck
from mlsys.formatting import sci_latex, md_frac, fmt
from mlsys.formatting import sci_latex, md_frac, fmt_percent, fmt
# ┌── LEGO ───────────────────────────────────────────────
class ResnetMobile:
@@ -1236,7 +1236,7 @@ from mlsys.constants import (
ESP32_RAM, ESP32_FLASH, ESP32_POWER_MIN, ESP32_POWER_MAX, ESP32_PRICE,
TB, GB, KiB, MB, watt, USD
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class HardwareSpectrumSetup:
@@ -1440,7 +1440,7 @@ above=1of $(B2.north east)!0.5!(B3.north west)$](B0){Cloud ML};
from mlsys.constants import SPEED_OF_LIGHT_FIBER_KM_S
from mlsys.formulas import calc_network_latency_ms
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class DistancePenalty:
@@ -1544,7 +1544,7 @@ from mlsys.constants import (
CLOUD_ELECTRICITY_PER_KWH, USD, GB, watt, ureg,
MILLION, MIB_TO_BYTES,
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class CloudEdgeTCO:
@@ -1739,7 +1739,7 @@ from mlsys.constants import (
BILLION, TRILLION, SEC_PER_HOUR, HOURS_PER_DAY,
BITS_PER_BYTE, KIB_TO_BYTES, MIB_TO_BYTES, MS_PER_SEC
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class VoiceAssistantWall:
@@ -1973,7 +1973,7 @@ The benefits of lower bandwidth usage and reduced latency become stark when we e
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware
from mlsys.formulas import calc_monthly_egress_cost
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.constants import (
VIDEO_1080P_WIDTH, VIDEO_1080P_HEIGHT, VIDEO_BYTES_PER_PIXEL_RGB,
VIDEO_FPS_STANDARD, CLOUD_EGRESS_PER_GB, MB, GB, second, MILLION,
@@ -2179,7 +2179,7 @@ To make these trade-offs concrete, the following worked example applies *edge in
from mlsys import Hardware, Models
from mlsys.constants import GFLOPs, CLOUD_ELECTRICITY_PER_KWH, HOURS_PER_YEAR, TFLOPs, USD, watt, ureg
from mlsys.formulas import calc_fleet_tco
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class EdgeSizing:
@@ -2481,7 +2481,7 @@ The battery life and resource constraints listed above translate directly into e
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware
from mlsys.constants import OBJECT_DETECTOR_POWER_W, ureg
from mlsys.formatting import md_frac, fmt
from mlsys.formatting import md_frac, fmt_percent, fmt
# ┌── LEGO ───────────────────────────────────────────────
class BatteryTax:
@@ -2549,7 +2549,7 @@ The battery constraint limits total energy consumption over time. However, even
# │ Exports: baseline_str, quant_power_str, quant_red_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class ThermalQuantCalc:
@@ -2675,7 +2675,7 @@ from mlsys.constants import (
BATTERY_CAPACITY_MAH, BATTERY_VOLTAGE_V, BATTERY_ENERGY_J,
ENERGY_MOBILENET_INF_MJ, ureg, BILLION
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class EnergyInference:
@@ -3503,7 +3503,7 @@ A related misconception holds that moving computation closer to the user always
# │ low_power_frac, high_power_frac
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check, md_frac
from mlsys.formatting import fmt_percent, fmt, check, md_frac
# ┌── LEGO ───────────────────────────────────────────────
class MobilePowerFallacyCalc:
@@ -3561,7 +3561,7 @@ The difference is qualitative, not just quantitative. As @sec-ml-systems-tinyml-
# │ edge_reliability_str, edge_total_str, tco_ratio_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class TcoPitfallCalc:
@@ -3620,7 +3620,7 @@ Teams optimize per-unit resource consumption while ignoring operational overhead
# │ Exports: cam_*_str variables for prose
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class AmdahlCameraCalc:
@@ -3649,8 +3649,8 @@ class AmdahlCameraCalc:
cam_ml_str = fmt(cam_ml_ms_value, precision=0, commas=False) # "60"
cam_post_str = fmt(cam_post_ms_value, precision=0, commas=False) # "40"
cam_total_str = fmt(cam_total_ms_value, precision=0, commas=False) # "200"
cam_ml_pct_str = fmt(cam_ml_frac_value * 100, precision=0, commas=False) # "30"
cam_non_ml_pct_str = fmt(cam_non_ml_frac_value * 100, precision=0, commas=False) # "70"
cam_ml_pct_str = fmt_percent(cam_ml_frac_value, precision=0, commas=False) # "30"
cam_non_ml_pct_str = fmt_percent(cam_non_ml_frac_value, precision=0, commas=False) # "70"
cam_speedup_10x_str = fmt(cam_speedup_10x_value, precision=2, commas=False) # "1.37"
cam_speedup_inf_str = fmt(cam_speedup_inf_value, precision=2, commas=False) # "1.43"
cam_ml_opt_str = fmt(cam_ml_optimized_ms_value, precision=0, commas=False) # "6"

View File

@@ -93,7 +93,7 @@ Inference computes a single forward pass: data flows through the network, a pred
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware, Models
from mlsys.constants import *
from mlsys.formatting import fmt, sci, md_math, check
from mlsys.formatting import fmt_percent, fmt, sci, md_math, check
from mlsys.formulas import model_memory
# ┌── LEGO ───────────────────────────────────────────────
@@ -700,7 +700,7 @@ To illustrate the scale of these operations concretely, consider the *attention
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware, Models
from mlsys.constants import TFLOPs, second, GPT2_HIDDEN_DIM, GPT2_LAYERS
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class GPT2Compute:
@@ -1033,7 +1033,7 @@ $$\begin{aligned}
# │ resnet50_param_mem_b32_mb_str, resnet50_act_mem_b64_gb_str,
# │ resnet50_grad_mem_b64_gb_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class ResNetBatchMemory:
@@ -1121,7 +1121,7 @@ v_t = \beta_2 v_{t-1} + (1-\beta_2)\big(\nabla \mathcal{L}(\theta_t)\big)^2
# │ Exports: adam_overhead_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import BYTES_FP32, byte, Mparam
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class AdamMemory:
@@ -1187,7 +1187,7 @@ The choice of optimization algorithm creates specific patterns of computation an
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware, Models
from mlsys.constants import BYTES_FP32, BYTES_FP16, GB, GiB
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.formulas import model_memory
# ┌── LEGO ───────────────────────────────────────────────
@@ -1438,7 +1438,7 @@ Here, we shift focus from *what* backpropagation computes to *what it costs* to
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware, Models
from mlsys.constants import BYTES_FP16, BYTES_ADAM_STATE, GB, MB, GiB, GPT2_HIDDEN_DIM, GPT2_LAYERS
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.formulas import model_memory
# ┌── LEGO ───────────────────────────────────────────────
@@ -1729,7 +1729,7 @@ plt.show()
# │ a100_bw_tbs_str, a100_ridge_str, h100_tflops_fp16_str,
# │ h100_bw_tbs_str, h100_ridge_str — all defined in training-setup.
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class AttentionIntensity:
@@ -2071,7 +2071,7 @@ Applying this throughput analysis to our GPT-2 Lighthouse Model reveals where th
# │ parallel_tokenization_ms_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import PCIE_GEN3_BW, GB, second
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class GPT2DataPipeline:
@@ -2175,7 +2175,7 @@ While data pipeline throughput determines how fast training data reaches the GPU
# │ Exports: model_params_b_str, gradient_size_str, allreduce_str,
# │ network_time_str, network_bw_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.constants import BYTES_FP16, ALLREDUCE_FACTOR
# ┌── LEGO ───────────────────────────────────────────────
@@ -2343,7 +2343,7 @@ These hardware utilization patterns reinforce the batch-size--utilization relati
# │ vram_hidden_str, vram_layers_str, vram_activations_gb_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import BYTES_FP16, BYTES_FP32, BYTES_ADAM_STATE, byte
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class VRAMRequirements:
@@ -2435,7 +2435,7 @@ The total memory scales linearly with batch size (as established in @eq-activati
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Hardware, Models
from mlsys.constants import BYTES_FP32, BYTES_FP16, GB, MB, Mparam, Bparam
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.formulas import model_memory
# ┌── LEGO ───────────────────────────────────────────────
@@ -2676,7 +2676,7 @@ from mlsys.constants import (
BILLION, TRILLION, MILLION, THOUSAND,
TFLOPs, TRILLION, SEC_PER_DAY, SEC_PER_YEAR_LEAP, HOURS_PER_DAY
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class LlamaTraining:
@@ -2742,7 +2742,7 @@ class LlamaTraining:
ub_params_b_str = fmt(params/BILLION, precision=0, commas=False)
ub_tokens_t_str = fmt(tokens/TRILLION, precision=0, commas=False)
ub_peak_tflops_str = fmt(peak_tflops, precision=0, commas=True)
ub_utilization_pct_str = fmt(utilization*100, precision=0, commas=False)
ub_utilization_pct_str = fmt_percent(utilization, precision=0, commas=False)
ub_effective_tflops_str = fmt(effective_tflops, precision=0, commas=False)
ub_num_gpus_str = f"{num_gpus:,}"
ub_rental_rate_str = fmt(rental_rate, precision=0, commas=False)
@@ -3279,7 +3279,7 @@ These benefits compound: a practitioner might simultaneously double batch size (
# │ ckpt_total_str, v100_capacity_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import GPT2_PARAMS, Mparam, Bparam, BYTES_FP32, BYTES_FP16, BYTES_ADAM_STATE, GB
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.formulas import model_memory
# ┌── LEGO ───────────────────────────────────────────────
@@ -3392,7 +3392,7 @@ model_1b_fp16_gb_str = MixedPrecisionMemory.model_1b_fp16_gb_str
# │ Imports: mlsys.formatting (fmt)
# │ Exports: v100_mp_speedup_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class MixedPrecisionSpeedup:
@@ -3606,7 +3606,7 @@ Optimal mixed-precision training requires matching the precision format to hardw
# │ Imports: mlsys.formatting (fmt)
# │ Exports: v100_fp16_speedup_str, a100_over_v100_str, h100_over_v100_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class CrossGenPrecisionCalc:
@@ -3675,7 +3675,7 @@ $$ \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right
# │ attn_matrix_mb_str, total_attn_gb_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import BYTES_FP32, MB, GB, byte
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class AttentionMemoryCalc:
@@ -3773,7 +3773,7 @@ Flash Attention achieves asymptotic improvements in both memory footprint and me
# │ Exports: fa_standard_mb_str, fa_flash_mb_str, fa_reduction_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import BYTES_FP32, MB
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class FlashAttentionCalc:
@@ -3891,7 +3891,7 @@ The benefits of Flash Attention become concrete when measured on real hardware.
# │ Imports: mlsys.formatting (fmt)
# │ Exports: flash_fwd_speedup_str, flash_bwd_speedup_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class FlashAttentionSpeedup:
@@ -4187,7 +4187,7 @@ Returning to our GPT-2 Lighthouse Model, *gradient accumulation* is essential fo
# │ accum_2wk_str, naive_2wk_str, comm_reduction_pct_str,
# │ accum_steps_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class GradientAccumulation:
@@ -4378,7 +4378,7 @@ To answer that question, let us walk through optimizing GPT-2 (1.5B parameters)
# │ amp_reduction_str, recompute_overhead_str, checkpoint_factor_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import GPT2_PARAMS, GPT2_LAYERS, GPT2_HIDDEN_DIM, V100_MEM_CAPACITY, GiB, BYTES_FP32, BYTES_FP16, GB, byte
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
from mlsys.formulas import model_memory
# ┌── LEGO ───────────────────────────────────────────────
@@ -4543,7 +4543,7 @@ The GPT-2 case study demonstrates how the optimization techniques examined in th
# │ Imports: mlsys.formatting (fmt)
# │ Exports: b_param_str..b_carbon_str, o_param_str..o_carbon_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class GPT2SummaryCalc:
@@ -4657,7 +4657,7 @@ o_carbon_str = GPT2SummaryCalc.o_carbon_str
# │ Imports: mlsys.formatting (fmt)
# │ Exports: mem_reduction_str, energy_reduction_pct_str, time_speedup_str
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class OptimizationSummaryCalc:
@@ -5062,7 +5062,7 @@ from mlsys.constants import (
A100_TDP, watt, GPUS_PER_HOST, BILLION, TRILLION,
SEC_PER_HOUR, HOURS_PER_DAY, THOUSAND
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class TrainingCarbonFootprint:
@@ -5218,7 +5218,7 @@ The journey from single-GPU optimization through multi-device parallelism reveal
# │ Exports: fp_model_20b_params_str..fp_prefetch_reduction_str (~30 vars)
# └─────────────────────────────────────────────────────────────────────────────
from mlsys.constants import *
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
class FallaciesPitfallsSetup:
"""Quantitative values for all Fallacies and Pitfalls examples."""

View File

@@ -80,6 +80,13 @@ def fmt_plain(val, unit_str=""):
"""Format a plain scalar with an explicit unit string."""
return f"{val:g}", unit_str
# --- H100 Recap ---
h100_fp16_tflops_str = fmt(H100_FLOPS_FP16_TENSOR.m_as(TFLOPs/second), precision=0, commas=False)
h100_fp8_tflops_str = fmt(H100_FLOPS_FP8_TENSOR.m_as(TFLOPs/second), precision=0, commas=True)
h100_bw_tb_str = fmt(H100_MEM_BW.m_as(TB/second), precision=2, commas=False)
h100_cap_gb_str = fmt(H100_MEM_CAPACITY.m_as(GB), precision=0, commas=False)
h100_tdp_w_str = fmt(H100_TDP.m_as(watt), precision=0, commas=False)
# --- Cluster Scale References ---
CLUSTER_SMALL_GPUS_val, CLUSTER_SMALL_GPUS_unit = fmt_plain(constants.CLUSTER_SMALL_GPUS, "GPUs")
CLUSTER_MEDIUM_GPUS_val, CLUSTER_MEDIUM_GPUS_unit = fmt_plain(constants.CLUSTER_MEDIUM_GPUS, "GPUs")
@@ -212,11 +219,11 @@ The distributed systems reasoning in this book builds upon the single-machine pe
| **Tier** | **Specification** | **Reference Value** |
|:---------------|:------------------|:---------------------------------|
| **Compute** | FP16 Throughput | `{python} H100_FLOPS_FP16_TENSOR.m_as(TFLOPs/second):.0f` TFLOPS |
| **Compute** | FP8 Throughput | `{python} H100_FLOPS_FP8_TENSOR.m_as(TFLOPs/second):,.0f` TFLOPS |
| **Memory** | HBM3 Bandwidth | `{python} H100_MEM_BW.m_as(TB/second):.2f` TB/s |
| **Memory** | HBM3 Capacity | `{python} H100_MEM_CAPACITY.m_as(GB):.0f` GB |
| **Thermal** | TDP | `{python} H100_TDP.m_as(watt):.0f` W |
| **Compute** | FP16 Throughput | `{python} h100_fp16_tflops_str` TFLOPS |
| **Compute** | FP8 Throughput | `{python} h100_fp8_tflops_str` TFLOPS |
| **Memory** | HBM3 Bandwidth | `{python} h100_bw_tb_str` TB/s |
| **Memory** | HBM3 Capacity | `{python} h100_cap_gb_str` GB |
| **Thermal** | TDP | `{python} h100_tdp_w_str` W |
: **Single-Node Foundational Constants**. Recapping the hardware specifications for the H100 accelerator. These values provide the $R_{\text{peak}}$ and $BW$ baselines used in the Iron Law calculations throughout this volume. {#tbl-fleet-foundation-recap}

View File

@@ -52,7 +52,7 @@ from mlsys.constants import (
GPU_MTTF_HOURS, GPUS_PER_HOST,
INFINIBAND_NDR_BW_GBS, NVLINK_H100_BW
)
from mlsys.formatting import fmt, check, md, md_math
from mlsys.formatting import fmt_percent, fmt, check, md, md_math
from mlsys.formulas import calc_effective_flops
class C3Taxonomy:
@@ -120,50 +120,50 @@ class C3Taxonomy:
# Case 1
case1_n_gpus_str = fmt(case1_n_gpus, precision=0)
case1_mfu_pct_str = fmt(case1_mfu * 100, precision=0, commas=False)
case1_scaling_eff_pct_str = fmt(case1_scaling_eff * 100, precision=0, commas=False)
case1_target_mfu_pct_str = fmt(MFU_TRAINING_HIGH * 100, precision=0, commas=False)
case1_mfu_pct_str = fmt_percent(case1_mfu, precision=0, commas=False)
case1_scaling_eff_pct_str = fmt_percent(case1_scaling_eff, precision=0, commas=False)
case1_target_mfu_pct_str = fmt_percent(MFU_TRAINING_HIGH, precision=0, commas=False)
case1_throughput_ratio_str = fmt(case1_throughput_ratio, precision=2, commas=False)
case1_wasted_pct_str = fmt(case1_wasted_pct, precision=0, commas=False)
# Case 2
case2_n_gpus_str = fmt(case2_n_gpus, precision=0)
case2_mfu_pct_str = fmt(case2_mfu * 100, precision=0, commas=False)
case2_comm_pct_str = fmt(case2_comm_fraction * 100, precision=0, commas=False)
case2_compute_pct_str = fmt(case2_compute_fraction * 100, precision=0, commas=False)
case2_mfu_pct_str = fmt_percent(case2_mfu, precision=0, commas=False)
case2_comm_pct_str = fmt_percent(case2_comm_fraction, precision=0, commas=False)
case2_compute_pct_str = fmt_percent(case2_compute_fraction, precision=0, commas=False)
case2_speedup_str = fmt(case2_speedup_if_fixed, precision=1, commas=False)
# Case 3
case3_n_gpus_str = fmt(case3_n_gpus, precision=0)
case3_mfu_pct_str = fmt(case3_mfu * 100, precision=0, commas=False)
case3_comm_pct_str = fmt(case3_comm_fraction * 100, precision=0, commas=False)
case3_goodput_pct_str = fmt(case3_goodput_ratio * 100, precision=0, commas=False)
case3_coord_pct_str = fmt(case3_coord_fraction * 100, precision=0, commas=False)
case3_mfu_pct_str = fmt_percent(case3_mfu, precision=0, commas=False)
case3_comm_pct_str = fmt_percent(case3_comm_fraction, precision=0, commas=False)
case3_goodput_pct_str = fmt_percent(case3_goodput_ratio, precision=0, commas=False)
case3_coord_pct_str = fmt_percent(case3_coord_fraction, precision=0, commas=False)
# Effective FLOPS
peak_pflops_str = fmt(peak_pflops, precision=0)
effective_pflops_str = fmt(effective_pflops, precision=0)
eff_fraction_pct_str = fmt(eff_fraction * 100, precision=1, commas=False)
eff_fraction_pct_str = fmt_percent(eff_fraction, precision=1, commas=False)
c3_tax_str = fmt(c3_tax, precision=1, commas=False)
mfu_pct_str = fmt(MFU_TRAINING_HIGH * 100, precision=0, commas=False)
scaling_pct_str = fmt(SCALING_EFF_8192GPU * 100, precision=0, commas=False)
goodput_pct_str = fmt(goodput_all * 100, precision=0, commas=False)
mfu_pct_str = fmt_percent(MFU_TRAINING_HIGH, precision=0, commas=False)
scaling_pct_str = fmt_percent(SCALING_EFF_8192GPU, precision=0, commas=False)
goodput_pct_str = fmt_percent(goodput_all, precision=0, commas=False)
# Overhead constants
oh_pipeline_str = fmt(OVERHEAD_PIPELINE_BUBBLE * 100, precision=0, commas=False)
oh_checkpoint_str = fmt(OVERHEAD_CHECKPOINT * 100, precision=0, commas=False)
oh_failure_str = fmt(OVERHEAD_FAILURE_RECOVERY * 100, precision=0, commas=False)
oh_maintenance_str = fmt(OVERHEAD_MAINTENANCE * 100, precision=0, commas=False)
oh_pipeline_str = fmt_percent(OVERHEAD_PIPELINE_BUBBLE, precision=0, commas=False)
oh_checkpoint_str = fmt_percent(OVERHEAD_CHECKPOINT, precision=0, commas=False)
oh_failure_str = fmt_percent(OVERHEAD_FAILURE_RECOVERY, precision=0, commas=False)
oh_maintenance_str = fmt_percent(OVERHEAD_MAINTENANCE, precision=0, commas=False)
# Scaling efficiency constants
eff_32_str = fmt(SCALING_EFF_32GPU * 100, precision=0, commas=False)
eff_256_str = fmt(SCALING_EFF_256GPU * 100, precision=0, commas=False)
eff_1024_str = fmt(SCALING_EFF_1024GPU * 100, precision=0, commas=False)
eff_8192_str = fmt(SCALING_EFF_8192GPU * 100, precision=0, commas=False)
eff_32_str = fmt_percent(SCALING_EFF_32GPU, precision=0, commas=False)
eff_256_str = fmt_percent(SCALING_EFF_256GPU, precision=0, commas=False)
eff_1024_str = fmt_percent(SCALING_EFF_1024GPU, precision=0, commas=False)
eff_8192_str = fmt_percent(SCALING_EFF_8192GPU, precision=0, commas=False)
# MFU constants
mfu_low_str = fmt(MFU_TRAINING_LOW * 100, precision=0, commas=False)
mfu_high_str = fmt(MFU_TRAINING_HIGH * 100, precision=0, commas=False)
mfu_low_str = fmt_percent(MFU_TRAINING_LOW, precision=0, commas=False)
mfu_high_str = fmt_percent(MFU_TRAINING_HIGH, precision=0, commas=False)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
C3 = C3Taxonomy
@@ -496,7 +496,7 @@ The fleet spends exactly half its time on useful computation. Breaking down the
# └─────────────────────────────────────────────────────────────────────────────
eff_frac_val = MFU_TRAINING_HIGH * SCALING_EFF_1024GPU * 0.85
eff_frac_pct_str = fmt(eff_frac_val * 100, precision=1, commas=False)
eff_frac_pct_str = fmt_percent(eff_frac_val, precision=1, commas=False)
c3_tax_val = 1 / eff_frac_val
c3_tax_factor_str = fmt(c3_tax_val, precision=1, commas=False)
eff_frac_3dp_str = fmt(eff_frac_val, precision=3, commas=False)

View File

@@ -277,7 +277,7 @@ _eff_8192 = fmt(FF.eff_8192, precision=0, commas=False)
_mtbf_8k_min = fmt(FF.mtbf_8192_min, precision=0, commas=False)
_mtbf_100k_min = fmt(FF.mtbf_100k_min, precision=0, commas=False)
_ckpt_175 = fmt(FF.ckpt_175b_gb, precision=0, commas=False)
_goodput = fmt(FF.goodput_ratio * 100, precision=0, commas=False)
_goodput = fmt_percent(FF.goodput_ratio, precision=0, commas=False)
_mfu_lo = fmt(FF.mfu_low, precision=0, commas=False)
_mfu_hi = fmt(FF.mfu_high, precision=0, commas=False)
_rack_ai = fmt(FF.rack_ai, precision=0, commas=False)
@@ -501,7 +501,7 @@ At fleet scale, four categories of overhead consume wall-clock time that is not
| **Failure recovery** | ~`{python} FF.oh_failure`% | Faster detection, elastic rescheduling |
| **Maintenance windows** | ~`{python} FF.oh_maintenance`% | Rolling upgrades, live migration |
: **Overhead Budgets for Fleet-Scale Training**: These are fractions of wall-clock time. At 10,000+ GPUs, failure recovery dominates. The compound effect is multiplicative: total goodput ratio $\approx (1 - 0.05)(1 - 0.03)(1 - 0.10)(1 - 0.05) \approx$ `{python} fmt(FF.goodput_ratio * 100, precision=0)`%. {#tbl-fleet-overhead-budgets}
: **Overhead Budgets for Fleet-Scale Training**: These are fractions of wall-clock time. At 10,000+ GPUs, failure recovery dominates. The compound effect is multiplicative: total goodput ratio $\approx (1 - 0.05)(1 - 0.03)(1 - 0.10)(1 - 0.05) \approx$ `{python} fmt_percent(FF.goodput_ratio, precision=0)`%. {#tbl-fleet-overhead-budgets}
#### Power and Sustainability Numbers {.unnumbered}
@@ -772,9 +772,9 @@ peak_str = fmt(FF.peak_1024, precision=0)
eff_str = fmt(FF.eff_flops_1024, precision=0)
# Effective % of peak (0.50 × 0.50 × 0.77 ≈ 19.25%); use fmt_percent to avoid display bugs
eff_pct_str = fmt_percent(FF.eff_fraction, precision=1)
goodput_pct_str = fmt(FF.goodput_ratio * 100, precision=0, commas=False)
mfu_pct_str = fmt(MFU_TRAINING_HIGH * 100, precision=0, commas=False)
scaling_pct_str = fmt(SCALING_EFF_1024GPU * 100, precision=0, commas=False)
goodput_pct_str = fmt_percent(FF.goodput_ratio, precision=0, commas=False)
mfu_pct_str = fmt_percent(MFU_TRAINING_HIGH, precision=0, commas=False)
scaling_pct_str = fmt_percent(SCALING_EFF_1024GPU, precision=0, commas=False)
mfu_fmt = fmt(MFU_TRAINING_HIGH, precision=2, commas=False)
scaling_fmt = fmt(SCALING_EFF_1024GPU, precision=2, commas=False)
@@ -793,7 +793,7 @@ $$\text{Effective} = \text{Peak} \times \text{MFU} \times \eta_{\text{scaling}}
`{python} effective_eq_math`
The cluster delivers `{python} eff_pct_str`% of its peak FLOPS as useful training work. The remaining `{python} fmt(100 - FF.eff_fraction * 100, precision=0)`% is consumed by hardware underutilization (`{python} mfu_pct_str`% MFU), communication overhead (`{python} scaling_pct_str`% scaling efficiency), and operational losses (`{python} goodput_pct_str`% goodput ratio).
The cluster delivers `{python} eff_pct_str`% of its peak FLOPS as useful training work. The remaining `{python} fmt_percent(1 - FF.eff_fraction, precision=0)`% is consumed by hardware underutilization (`{python} mfu_pct_str`% MFU), communication overhead (`{python} scaling_pct_str`% scaling efficiency), and operational losses (`{python} goodput_pct_str`% goodput ratio).
This is not a failure of engineering---it is the physics of fleet-scale computation. Every additional GPU adds less marginal useful work, but the total throughput still far exceeds what a smaller cluster could achieve. The goal is not to reach 100% utilization; the goal is to deliver trained models faster than any smaller configuration could.

View File

@@ -23,7 +23,7 @@ from mlsys.constants import (
GPT3_PARAMS, GB, second, Mparam, THOUSAND,
SEC_PER_HOUR, SEC_PER_DAY, MILLION, TRILLION, BITS_PER_BYTE, TB
)
from mlsys.formatting import fmt, sci, check
from mlsys.formatting import fmt_percent, fmt, sci, check
start_chapter("vol2:distributed_training")
```
@@ -96,7 +96,7 @@ from mlsys.constants import (
GB, second, Mparam, Tparam, THOUSAND,
SEC_PER_HOUR, SEC_PER_DAY, MILLION, TRILLION, BITS_PER_BYTE, TB
)
from mlsys.formatting import fmt, sci, check
from mlsys.formatting import fmt_percent, fmt, sci, check
# ┌── LEGO ───────────────────────────────────────────────
```
## Why Distribution Is Necessary {#sec-distributed-training-systems-systems-multimachine-scaling-fundamentals-ff96}
@@ -681,7 +681,7 @@ Modern distributed training frameworks handle this distribution automatically th
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Models, Applications
from mlsys.constants import param, BILLION
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
class FrontierTrainingContext:
"""GPT-3 scale reference for distributed training."""
@@ -721,7 +721,7 @@ frontier_name = FrontierTrainingContext.frontier_name
# └─────────────────────────────────────────────────────────────────────────────
from mlsys import Models, Applications
from mlsys.constants import param, BILLION
from mlsys.formatting import fmt
from mlsys.formatting import fmt_percent, fmt
class FrontierTrainingContext:
"""GPT-3 scale reference for distributed training."""
@@ -959,7 +959,7 @@ from mlsys.constants import (
NVLINK_H100_BW, INFINIBAND_HDR_BW, MILLION, BILLION, GB, byte, second,
GPUS_PER_HOST, BITS_PER_BYTE
)
from mlsys.formatting import fmt, check
from mlsys.formatting import fmt_percent, fmt, check
# ┌── LEGO ───────────────────────────────────────────────
class Scaling8GPU:
@@ -2779,7 +2779,7 @@ class YoungDaly:
# ┌── 4. OUTPUT (Formatting) ──────────────────────────────────────────────
t_opt_min_str = fmt(t_opt_min_val, precision=0)
loss_pct_str = fmt(total_overhead * 100, precision=1)
loss_pct_str = fmt_percent(total_overhead, precision=1)
daily_savings_str = fmt(diff_daily, precision=0, commas=True)
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────