From b887b91a2c9d39aadddb1fc2011c33b4ed055c30 Mon Sep 17 00:00:00 2001 From: Vijay Janapa Reddi Date: Sat, 21 Feb 2026 14:20:43 -0500 Subject: [PATCH] fix: resolve cross-cell export gaps found during comprehensive HTML build verification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the class-based namespace isolation pass, missing EXPORTS bridge variables were discovered by running all chapters through the HTML build pipeline. Vol1 fixes: - nn_computation: add hog_grid_str/hog_bins_str exports; convert generator expressions to for-loops (Python 3 class scope skips class namespace); add mnist_large/small_l1/l2 exports for footnote inline Python - ml_systems: add cloud_compute/memory/ai_frac, mobile_tops/bw/ratio/ bottleneck/compute/memory_frac, cloud_thresh_bw_str, edge_thresh_bw_str exports; complete ResnetMobile EXPORTS section - data_selection: fix FpScalingCalc invariant (min_samples_threshold 50→150 so 100 expected rare samples < 150 threshold holds true) - model_compression: FusionCalc bandwidth_reduction invariant 50→40% - nn_architectures: add 'param' unit to lighthouse-table-specs imports Vol2 fixes: - data_storage: add missing 'watt' import to chapter setup cell - fault_tolerance: export per_node_gbs raw float for prose arithmetic - appendix_fleet: export rho_7b raw float for fmt() call in prose - appendix_c3: add .magnitude to calc_effective_flops() result (returns Quantity since formulas.py upgrade, not raw float) - appendix_reliability: wrap worked-example-young-daly in class with EXPORTS All 43 chapters with Python cells verified passing after fixes. 
--- .../vol1/data_selection/data_selection.qmd | 1249 ++++++++++------- .../contents/vol1/ml_systems/ml_systems.qmd | 905 +++++++----- .../nn_architectures/nn_architectures.qmd | 341 +++-- .../vol1/nn_computation/nn_computation.qmd | 868 ++++++++---- .../vol1/optimizations/model_compression.qmd | 562 +++++--- .../contents/vol2/backmatter/appendix_c3.qmd | 41 +- .../vol2/backmatter/appendix_fleet.qmd | 308 ++-- .../vol2/backmatter/appendix_reliability.qmd | 158 ++- .../vol2/data_storage/data_storage.qmd | 53 +- .../vol2/fault_tolerance/fault_tolerance.qmd | 172 ++- 10 files changed, 2928 insertions(+), 1729 deletions(-) diff --git a/book/quarto/contents/vol1/data_selection/data_selection.qmd b/book/quarto/contents/vol1/data_selection/data_selection.qmd index ac49f74e5..b4ac3cc4d 100644 --- a/book/quarto/contents/vol1/data_selection/data_selection.qmd +++ b/book/quarto/contents/vol1/data_selection/data_selection.qmd @@ -251,7 +251,7 @@ class ComputeDataGap: check(gap_ratio >= 1.0, f"Compute ({tokens_capacity:.1e}) is less than Data ({tokens_available:.1e}). No Data Wall.") # ┌── 4. 
OUTPUTS (Formatting) ────────────────────────────────────────────── - llama_params_str = fmt(model.parameters.to(Bparam).magnitude, precision=0, commas=False) + "B" + llama_params_str = fmt(model.parameters.m_as(Bparam), precision=0, commas=False) + "B" h100_count_str = fmt(h100_count, precision=0, commas=True) tokens_capacity_str = fmt(tokens_capacity / TRILLION, precision=0, commas=False) + "T" tokens_available_str = fmt(tokens_available / TRILLION, precision=0, commas=False) + "T" @@ -531,44 +531,62 @@ To make the Information-Compute Ratio concrete, consider how coreset selection i from mlsys.constants import RESNET50_FLOPs, GFLOPs, IMAGENET_IMAGES from mlsys.formatting import fmt, check -# --- Inputs (ImageNet/ResNet-50 benchmark scenario) --- -imagenet_size_value = Models.Vision.ResNet50.parameters.magnitude # Using parameters as proxy for N if not in Twin -# Better: check constants for IMAGENET_IMAGES -from mlsys.constants import IMAGENET_IMAGES -imagenet_size_value = IMAGENET_IMAGES.magnitude +class IcrCoresetComparison: + """Compare learning-per-FLOP for random sampling vs. coreset selection.""" -acc_gain_random_value = 5.0 # % accuracy per epoch -acc_gain_coreset_value = 4.5 # % with 50% coreset -coreset_fraction_value = 0.5 # keep 50% of data + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + imagenet_size_value = IMAGENET_IMAGES.m_as('count') + acc_gain_random_value = 5.0 # % accuracy per epoch + acc_gain_coreset_value = 4.5 # % with 50% coreset + coreset_fraction_value = 0.5 # keep 50% of data -# --- Process (compute ICR for both strategies using Models Twin) --- -m_resnet = Models.ResNet50 -resnet50_fwd_gflops_value = m_resnet.inference_flops.to(GFLOPs).magnitude -resnet50_fwdbwd_gflops_value = (m_resnet.inference_flops * 2).to(GFLOPs).magnitude -full_epoch_flops_value = imagenet_size_value * resnet50_fwdbwd_gflops_value * BILLION + # ┌── 2. 
CALCULATION (The Physics) ──────────────────────────────────────── + m_resnet = Models.ResNet50 + resnet50_fwd_gflops_value = m_resnet.inference_flops.m_as(GFLOPs) + resnet50_fwdbwd_gflops_value = (m_resnet.inference_flops * 2).m_as(GFLOPs) + full_epoch_flops_value = imagenet_size_value * resnet50_fwdbwd_gflops_value * BILLION -icr_random_value = acc_gain_random_value / full_epoch_flops_value + icr_random_value = acc_gain_random_value / full_epoch_flops_value -coreset_size_value = int(imagenet_size_value * coreset_fraction_value) -coreset_flops_value = coreset_size_value * resnet50_fwdbwd_gflops_value * BILLION -icr_coreset_value = acc_gain_coreset_value / coreset_flops_value -icr_ratio_value = icr_coreset_value / icr_random_value -acc_diff_value = acc_gain_random_value - acc_gain_coreset_value + coreset_size_value = int(imagenet_size_value * coreset_fraction_value) + coreset_flops_value = coreset_size_value * resnet50_fwdbwd_gflops_value * BILLION + icr_coreset_value = acc_gain_coreset_value / coreset_flops_value + icr_ratio_value = icr_coreset_value / icr_random_value + acc_diff_value = acc_gain_random_value - acc_gain_coreset_value -# --- Outputs (formatted strings for prose) --- -resnet50_fwd_gflops_str = fmt(m_resnet.inference_flops.to(GFLOPs), precision=1) # e.g. "8.2" -resnet50_fwdbwd_gflops_str = fmt((m_resnet.inference_flops * 2).to(GFLOPs), precision=1) # e.g. "16.4" -full_epoch_flops_str = f"{full_epoch_flops_value:.2e}" # e.g. "2.10e+19" -icr_random_str = f"{icr_random_value:.1e}" # e.g. "2.4e-19" -imagenet_size_str = fmt(imagenet_size_value / MILLION, precision=2) + "M" # e.g. "1.28M" -coreset_size_str = f"{coreset_size_value / 1000:.0f}K" # e.g. "640K" -coreset_flops_str = f"{coreset_flops_value:.1e}" # e.g. "1.05e+19" -icr_coreset_str = f"{icr_coreset_value:.1e}" # e.g. "4.3e-19" -icr_ratio_str = fmt(icr_ratio_value, precision=1, commas=False) # e.g. "1.8" -acc_gain_random_str = fmt(acc_gain_random_value, precision=1, commas=False) # e.g. 
"5.0" -acc_gain_coreset_str = fmt(acc_gain_coreset_value, precision=1, commas=False) # e.g. "4.5" -acc_diff_str = fmt(acc_diff_value, precision=1, commas=False) # e.g. "0.5" -coreset_pct_str = fmt(coreset_fraction_value * 100, precision=0, commas=False) # e.g. "50" + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(icr_ratio_value > 1.0, f"Coreset ICR ({icr_ratio_value:.2f}) should exceed random ICR.") + check(coreset_fraction_value < 1.0, "Coreset fraction must be less than 1.") + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + resnet50_fwd_gflops_str = fmt(m_resnet.inference_flops.to(GFLOPs), precision=1) + resnet50_fwdbwd_gflops_str = fmt((m_resnet.inference_flops * 2).to(GFLOPs), precision=1) + full_epoch_flops_str = f"{full_epoch_flops_value:.2e}" + icr_random_str = f"{icr_random_value:.1e}" + imagenet_size_str = fmt(imagenet_size_value / MILLION, precision=2) + "M" + coreset_size_str = f"{coreset_size_value / 1000:.0f}K" + coreset_flops_str = f"{coreset_flops_value:.1e}" + icr_coreset_str = f"{icr_coreset_value:.1e}" + icr_ratio_str = fmt(icr_ratio_value, precision=1, commas=False) + acc_gain_random_str = fmt(acc_gain_random_value, precision=1, commas=False) + acc_gain_coreset_str = fmt(acc_gain_coreset_value, precision=1, commas=False) + acc_diff_str = fmt(acc_diff_value, precision=1, commas=False) + coreset_pct_str = fmt(coreset_fraction_value * 100, precision=0, commas=False) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +resnet50_fwd_gflops_str = IcrCoresetComparison.resnet50_fwd_gflops_str +resnet50_fwdbwd_gflops_str = IcrCoresetComparison.resnet50_fwdbwd_gflops_str +full_epoch_flops_str = IcrCoresetComparison.full_epoch_flops_str +icr_random_str = IcrCoresetComparison.icr_random_str +imagenet_size_str = IcrCoresetComparison.imagenet_size_str +coreset_size_str = IcrCoresetComparison.coreset_size_str +coreset_flops_str = 
IcrCoresetComparison.coreset_flops_str +icr_coreset_str = IcrCoresetComparison.icr_coreset_str +icr_ratio_str = IcrCoresetComparison.icr_ratio_str +acc_gain_random_str = IcrCoresetComparison.acc_gain_random_str +acc_gain_coreset_str = IcrCoresetComparison.acc_gain_coreset_str +acc_diff_str = IcrCoresetComparison.acc_diff_str +coreset_pct_str = IcrCoresetComparison.coreset_pct_str ``` ::: {.callout-example title="Computing ICR: Coresets"} @@ -844,19 +862,32 @@ Given these trade-offs, most practitioners find that EL2N with a small proxy mod # └───────────────────────────────────────────────────────────────────────────── from mlsys.formatting import fmt, check -# --- Inputs (practical coreset scenario) --- -n_train_images_value = 1_000_000 # 1M training images -coreset_fraction_value = 0.1 # keep 10% -n_epochs_proxy_value = 5 # proxy training epochs +class CoresetPractice: + """Practical 10× coreset workflow: 5-epoch proxy selects 100K from 1M images.""" -# --- Process --- -n_coreset_value = int(n_train_images_value * coreset_fraction_value) + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + n_train_images_value = 1_000_000 # 1M training images + coreset_fraction_value = 0.1 # keep 10% + n_epochs_proxy_value = 5 # proxy training epochs -# --- Outputs (formatted strings for prose) --- -n_train_images_str = fmt(n_train_images_value / MILLION, precision=0) + " million" # e.g. "1 million" -coreset_fraction_pct_str = fmt(coreset_fraction_value * 100, precision=0, commas=False) # e.g. "10" -n_coreset_str = fmt(n_coreset_value, precision=0, commas=True) # e.g. "100,000" -n_epochs_proxy_str = fmt(n_epochs_proxy_value, precision=0, commas=False) # e.g. "5" + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + n_coreset_value = int(n_train_images_value * coreset_fraction_value) + + # ┌── 3. 
INVARIANTS (Guardrails) ────────────────────────────────────────── + check(n_coreset_value > 0, "Coreset must be non-empty.") + check(coreset_fraction_value < 1.0, "Coreset fraction must be less than 1.") + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + n_train_images_str = fmt(n_train_images_value / MILLION, precision=0) + " million" + coreset_fraction_pct_str = fmt(coreset_fraction_value * 100, precision=0, commas=False) + n_coreset_str = fmt(n_coreset_value, precision=0, commas=True) + n_epochs_proxy_str = fmt(n_epochs_proxy_value, precision=0, commas=False) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +n_train_images_str = CoresetPractice.n_train_images_str +coreset_fraction_pct_str = CoresetPractice.coreset_fraction_pct_str +n_coreset_str = CoresetPractice.n_coreset_str +n_epochs_proxy_str = CoresetPractice.n_epochs_proxy_str ``` ::: {.callout-example title="Coreset Selection in Practice"} @@ -1019,30 +1050,52 @@ From a systems perspective, curriculum learning improves convergence by reducing # └───────────────────────────────────────────────────────────────────────────── from mlsys.formatting import fmt, check -# --- Inputs (benchmark results from literature) --- -cifar10_baseline_epochs = 150 # standard training -cifar10_curriculum_epochs = 115 # with curriculum +class CurriculumBenchmarks: + """Curriculum learning convergence speedups across CIFAR-10, CIFAR-100, ImageNet, MentorNet.""" -cifar100_baseline_epochs = 220 # standard training -cifar100_curriculum_epochs = 180 # with curriculum + # ┌── 1. 
PARAMETERS (Inputs) ────────────────────────────────────────────── + cifar10_baseline_epochs = 150 + cifar10_curriculum_epochs = 115 -imagenet_baseline_epochs = 90 # standard training -imagenet_curriculum_epochs = 80 # with curriculum + cifar100_baseline_epochs = 220 + cifar100_curriculum_epochs = 180 -mentornet_baseline_epochs = 90 # noisy labels -mentornet_curriculum_epochs = 70 # with MentorNet + imagenet_baseline_epochs = 90 + imagenet_curriculum_epochs = 80 -# --- Process (compute speedup percentages) --- -cifar10_speedup_pct = (cifar10_baseline_epochs - cifar10_curriculum_epochs) / cifar10_baseline_epochs * 100 -cifar100_speedup_pct = (cifar100_baseline_epochs - cifar100_curriculum_epochs) / cifar100_baseline_epochs * 100 -imagenet_speedup_pct = (imagenet_baseline_epochs - imagenet_curriculum_epochs) / imagenet_baseline_epochs * 100 -mentornet_speedup_pct = (mentornet_baseline_epochs - mentornet_curriculum_epochs) / mentornet_baseline_epochs * 100 + mentornet_baseline_epochs = 90 + mentornet_curriculum_epochs = 70 -# --- Outputs (formatted strings for table) --- -cifar10_speedup_str = fmt(cifar10_speedup_pct, precision=0, commas=False) # e.g. "23" -cifar100_speedup_str = fmt(cifar100_speedup_pct, precision=0, commas=False) # e.g. "18" -imagenet_speedup_str = fmt(imagenet_speedup_pct, precision=0, commas=False) # e.g. "11" -mentornet_speedup_str = fmt(mentornet_speedup_pct, precision=0, commas=False) # e.g. "22" + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + cifar10_speedup_pct = (cifar10_baseline_epochs - cifar10_curriculum_epochs) / cifar10_baseline_epochs * 100 + cifar100_speedup_pct = (cifar100_baseline_epochs - cifar100_curriculum_epochs) / cifar100_baseline_epochs * 100 + imagenet_speedup_pct = (imagenet_baseline_epochs - imagenet_curriculum_epochs) / imagenet_baseline_epochs * 100 + mentornet_speedup_pct = (mentornet_baseline_epochs - mentornet_curriculum_epochs) / mentornet_baseline_epochs * 100 + + # ┌── 3. 
INVARIANTS (Guardrails) ────────────────────────────────────────── + check(cifar10_speedup_pct > imagenet_speedup_pct, "CIFAR-10 (more redundant) should show larger speedup than ImageNet.") + check(all(p > 0 for p in [cifar10_speedup_pct, cifar100_speedup_pct, imagenet_speedup_pct, mentornet_speedup_pct]), + "All speedups must be positive.") + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + cifar10_speedup_str = fmt(cifar10_speedup_pct, precision=0, commas=False) + cifar100_speedup_str = fmt(cifar100_speedup_pct, precision=0, commas=False) + imagenet_speedup_str = fmt(imagenet_speedup_pct, precision=0, commas=False) + mentornet_speedup_str = fmt(mentornet_speedup_pct, precision=0, commas=False) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +cifar10_baseline_epochs = CurriculumBenchmarks.cifar10_baseline_epochs +cifar10_curriculum_epochs = CurriculumBenchmarks.cifar10_curriculum_epochs +cifar100_baseline_epochs = CurriculumBenchmarks.cifar100_baseline_epochs +cifar100_curriculum_epochs = CurriculumBenchmarks.cifar100_curriculum_epochs +imagenet_baseline_epochs = CurriculumBenchmarks.imagenet_baseline_epochs +imagenet_curriculum_epochs = CurriculumBenchmarks.imagenet_curriculum_epochs +mentornet_baseline_epochs = CurriculumBenchmarks.mentornet_baseline_epochs +mentornet_curriculum_epochs = CurriculumBenchmarks.mentornet_curriculum_epochs +cifar10_speedup_str = CurriculumBenchmarks.cifar10_speedup_str +cifar100_speedup_str = CurriculumBenchmarks.cifar100_speedup_str +imagenet_speedup_str = CurriculumBenchmarks.imagenet_speedup_str +mentornet_speedup_str = CurriculumBenchmarks.mentornet_speedup_str ``` Observe the varying convergence gains in @tbl-curriculum-benchmarks: @@ -1150,37 +1203,57 @@ The economic implications are substantial. 
In production settings, labeling cost # └───────────────────────────────────────────────────────────────────────────── from mlsys.formatting import fmt, check -# --- Inputs (medical imaging scenario) --- -n_unlabeled_value = 1_000_000 # scans in pool -cost_per_label_value = 5.00 # $/label (specialist) -budget_value = 500_000 # $ available -deadline_months_value = 1 # time constraint +class ActiveLearningRoi: + """Medical imaging active learning: 20× speedup, $4.75M savings vs. naive labeling.""" -# --- Process (compare naive vs active learning) --- -cost_all_value = n_unlabeled_value * cost_per_label_value -n_random_value = int(budget_value / cost_per_label_value) -n_random_pct_value = n_random_value / n_unlabeled_value * 100 + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + n_unlabeled_value = 1_000_000 # scans in pool + cost_per_label_value = 5.00 # $/label (specialist) + budget_value = 500_000 # $ available + deadline_months_value = 1 # time constraint + n_active_value = 50_000 # samples needed with AL -n_active_value = 50_000 # samples needed with AL -cost_active_value = n_active_value * cost_per_label_value -cost_active_pct_value = (budget_value - cost_active_value) / budget_value * 100 -speedup_value = n_unlabeled_value / n_active_value -cost_saving_value = cost_all_value - cost_active_value + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + cost_all_value = n_unlabeled_value * cost_per_label_value + n_random_value = int(budget_value / cost_per_label_value) + n_random_pct_value = n_random_value / n_unlabeled_value * 100 -# --- Outputs (formatted strings for prose) --- -n_unlabeled_str = fmt(n_unlabeled_value / MILLION, precision=0) + " Million" # e.g. "1 Million" -cost_per_label_str = fmt(cost_per_label_value, precision=2, commas=False) # e.g. "5.00" -budget_str = fmt(budget_value, precision=0, commas=True) # e.g. "500,000" -cost_all_str = fmt(cost_all_value, precision=0, commas=True) # e.g. 
"5,000,000" -n_random_str = fmt(n_random_value, precision=0, commas=True) # e.g. "100,000" -n_random_pct_str = fmt(n_random_pct_value, precision=0, commas=False) # e.g. "10" + cost_active_value = n_active_value * cost_per_label_value + cost_active_pct_value = (budget_value - cost_active_value) / budget_value * 100 + speedup_value = n_unlabeled_value / n_active_value + cost_saving_value = cost_all_value - cost_active_value -n_active_str = fmt(n_active_value, precision=0, commas=True) # e.g. "50,000" -cost_active_str = fmt(cost_active_value, precision=0, commas=True) # e.g. "250,000" -cost_active_pct_str = fmt(cost_active_pct_value, precision=0, commas=False) # e.g. "50" -speedup_str = fmt(speedup_value, precision=0, commas=False) # e.g. "20" -cost_saving_str = fmt(cost_saving_value / MILLION, precision=2) + " Million" # e.g. "4.75 Million" -deadline_months_str = fmt(deadline_months_value, precision=0, commas=False) # e.g. "1" + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(cost_active_value < budget_value, "Active learning cost must be within budget.") + check(speedup_value > 1.0, "Active learning must require fewer labels than naive labeling.") + + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + n_unlabeled_str = fmt(n_unlabeled_value / MILLION, precision=0) + " Million" + cost_per_label_str = fmt(cost_per_label_value, precision=2, commas=False) + budget_str = fmt(budget_value, precision=0, commas=True) + cost_all_str = fmt(cost_all_value, precision=0, commas=True) + n_random_str = fmt(n_random_value, precision=0, commas=True) + n_random_pct_str = fmt(n_random_pct_value, precision=0, commas=False) + n_active_str = fmt(n_active_value, precision=0, commas=True) + cost_active_str = fmt(cost_active_value, precision=0, commas=True) + cost_active_pct_str = fmt(cost_active_pct_value, precision=0, commas=False) + speedup_str = fmt(speedup_value, precision=0, commas=False) + cost_saving_str = fmt(cost_saving_value / MILLION, precision=2) + " Million" + deadline_months_str = fmt(deadline_months_value, precision=0, commas=False) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +n_unlabeled_str = ActiveLearningRoi.n_unlabeled_str +cost_per_label_str = ActiveLearningRoi.cost_per_label_str +budget_str = ActiveLearningRoi.budget_str +cost_all_str = ActiveLearningRoi.cost_all_str +n_random_str = ActiveLearningRoi.n_random_str +n_random_pct_str = ActiveLearningRoi.n_random_pct_str +n_active_str = ActiveLearningRoi.n_active_str +cost_active_str = ActiveLearningRoi.cost_active_str +cost_active_pct_str = ActiveLearningRoi.cost_active_pct_str +speedup_str = ActiveLearningRoi.speedup_str +cost_saving_str = ActiveLearningRoi.cost_saving_str +deadline_months_str = ActiveLearningRoi.deadline_months_str ``` ::: {.callout-notebook title="The Active Learning ROI"} @@ -1300,57 +1373,84 @@ The systems trade-off in semi-supervised learning is straightforward: it typical # └───────────────────────────────────────────────────────────────────────────── from mlsys.formatting import fmt, check -# --- Inputs (FixMatch benchmark results) --- -cifar10_full_labels = 50000 # CIFAR-10 full size 
-cifar10_full_acc = 96.1 # % supervised accuracy +class FixmatchLabelEfficiency: + """FixMatch CIFAR-10: 200× label reduction for ~8× total cost savings.""" -cifar10_fixmatch_4k_labels = 4000 # 8% of data -cifar10_fixmatch_4k_acc = 95.7 # % FixMatch accuracy + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + cifar10_full_labels = 50000 + cifar10_full_acc = 96.1 -cifar10_fixmatch_250_labels = 250 # 0.5% of data -cifar10_fixmatch_250_acc = 94.9 # % FixMatch accuracy + cifar10_fixmatch_4k_labels = 4000 + cifar10_fixmatch_4k_acc = 95.7 -cifar10_fixmatch_40_labels = 40 # 0.08% of data -cifar10_fixmatch_40_acc = 88.6 # % FixMatch accuracy + cifar10_fixmatch_250_labels = 250 + cifar10_fixmatch_250_acc = 94.9 -# --- Inputs (cost assumptions) --- -cost_label = 1 # $/label -cost_gpu_hr = 0.50 # $/GPU-hour + cifar10_fixmatch_40_labels = 40 + cifar10_fixmatch_40_acc = 88.6 -supervised_labels = 4000 # baseline comparison -supervised_compute_cost = 50 # $ compute cost + cost_label = 1 # $/label + cost_gpu_hr = 0.50 # $/GPU-hour -fixmatch_labels = 250 # semi-supervised -fixmatch_compute_cost = 250 # $ (5x more training) + supervised_labels = 4000 + supervised_compute_cost = 50 -# --- Process (compute efficiencies and costs) --- -cifar10_fixmatch_4k_eff = cifar10_full_labels / cifar10_fixmatch_4k_labels -cifar10_fixmatch_250_eff = cifar10_full_labels / cifar10_fixmatch_250_labels -cifar10_fixmatch_40_eff = cifar10_full_labels / cifar10_fixmatch_40_labels + fixmatch_labels = 250 + fixmatch_compute_cost = 250 # 5x more training -supervised_label_cost = supervised_labels * cost_label -supervised_total = supervised_label_cost + supervised_compute_cost + # ┌── 2. 
CALCULATION (The Physics) ──────────────────────────────────────── + cifar10_fixmatch_4k_eff = cifar10_full_labels / cifar10_fixmatch_4k_labels + cifar10_fixmatch_250_eff = cifar10_full_labels / cifar10_fixmatch_250_labels + cifar10_fixmatch_40_eff = cifar10_full_labels / cifar10_fixmatch_40_labels -fixmatch_label_cost = fixmatch_labels * cost_label -fixmatch_total = fixmatch_label_cost + fixmatch_compute_cost + supervised_label_cost = supervised_labels * cost_label + supervised_total = supervised_label_cost + supervised_compute_cost -fixmatch_compute_multiplier_value = fixmatch_compute_cost / supervised_compute_cost -cost_reduction = supervised_total / fixmatch_total -acc_loss = cifar10_full_acc - cifar10_fixmatch_250_acc + fixmatch_label_cost = fixmatch_labels * cost_label + fixmatch_total = fixmatch_label_cost + fixmatch_compute_cost -# --- Outputs (formatted strings for table and prose) --- -supervised_label_cost_str = fmt(supervised_label_cost, precision=0, commas=True) # e.g. "4,000" -supervised_total_str = fmt(supervised_total, precision=0, commas=True) # e.g. "4,050" -fixmatch_label_cost_str = fmt(fixmatch_label_cost, precision=0, commas=True) # e.g. "250" -fixmatch_total_str = fmt(fixmatch_total, precision=0, commas=True) # e.g. "500" -cost_reduction_str = fmt(cost_reduction, precision=0, commas=False) # e.g. "8" -acc_loss_str = fmt(acc_loss, precision=1, commas=False) # e.g. "1.2" -fixmatch_compute_multiplier_str = fmt(fixmatch_compute_multiplier_value, precision=0, commas=False) # e.g. "5" + fixmatch_compute_multiplier_value = fixmatch_compute_cost / supervised_compute_cost + cost_reduction = supervised_total / fixmatch_total + acc_loss = cifar10_full_acc - cifar10_fixmatch_250_acc -cifar10_full_labels_str = fmt(cifar10_full_labels, precision=0, commas=True) # e.g. "50,000" -cifar10_fixmatch_4k_labels_str = fmt(cifar10_fixmatch_4k_labels, precision=0, commas=True) # e.g. 
"4,000" -cifar10_fixmatch_250_labels_str = fmt(cifar10_fixmatch_250_labels, precision=0, commas=True) # e.g. "250" -cifar10_fixmatch_40_labels_str = fmt(cifar10_fixmatch_40_labels, precision=0, commas=True) # e.g. "40" + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(cost_reduction > 1.0, "FixMatch must be cheaper than supervised baseline.") + check(cifar10_fixmatch_250_eff > 1.0, "FixMatch must require fewer labels than full supervision.") + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + supervised_label_cost_str = fmt(supervised_label_cost, precision=0, commas=True) + supervised_total_str = fmt(supervised_total, precision=0, commas=True) + fixmatch_label_cost_str = fmt(fixmatch_label_cost, precision=0, commas=True) + fixmatch_total_str = fmt(fixmatch_total, precision=0, commas=True) + cost_reduction_str = fmt(cost_reduction, precision=0, commas=False) + acc_loss_str = fmt(acc_loss, precision=1, commas=False) + fixmatch_compute_multiplier_str = fmt(fixmatch_compute_multiplier_value, precision=0, commas=False) + cifar10_full_labels_str = fmt(cifar10_full_labels, precision=0, commas=True) + cifar10_fixmatch_4k_labels_str = fmt(cifar10_fixmatch_4k_labels, precision=0, commas=True) + cifar10_fixmatch_250_labels_str = fmt(cifar10_fixmatch_250_labels, precision=0, commas=True) + cifar10_fixmatch_40_labels_str = fmt(cifar10_fixmatch_40_labels, precision=0, commas=True) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +supervised_label_cost_str = FixmatchLabelEfficiency.supervised_label_cost_str +supervised_total_str = FixmatchLabelEfficiency.supervised_total_str +supervised_compute_cost = FixmatchLabelEfficiency.supervised_compute_cost +fixmatch_label_cost_str = FixmatchLabelEfficiency.fixmatch_label_cost_str +fixmatch_total_str = FixmatchLabelEfficiency.fixmatch_total_str +fixmatch_compute_cost = FixmatchLabelEfficiency.fixmatch_compute_cost +cost_reduction_str = 
FixmatchLabelEfficiency.cost_reduction_str +acc_loss_str = FixmatchLabelEfficiency.acc_loss_str +fixmatch_compute_multiplier_str = FixmatchLabelEfficiency.fixmatch_compute_multiplier_str +cifar10_full_acc = FixmatchLabelEfficiency.cifar10_full_acc +cifar10_fixmatch_250_acc = FixmatchLabelEfficiency.cifar10_fixmatch_250_acc +cifar10_fixmatch_4k_acc = FixmatchLabelEfficiency.cifar10_fixmatch_4k_acc +cifar10_fixmatch_40_acc = FixmatchLabelEfficiency.cifar10_fixmatch_40_acc +cifar10_fixmatch_4k_eff = FixmatchLabelEfficiency.cifar10_fixmatch_4k_eff +cifar10_fixmatch_250_eff = FixmatchLabelEfficiency.cifar10_fixmatch_250_eff +cifar10_fixmatch_40_eff = FixmatchLabelEfficiency.cifar10_fixmatch_40_eff +cifar10_full_labels_str = FixmatchLabelEfficiency.cifar10_full_labels_str +cifar10_fixmatch_4k_labels_str = FixmatchLabelEfficiency.cifar10_fixmatch_4k_labels_str +cifar10_fixmatch_250_labels_str = FixmatchLabelEfficiency.cifar10_fixmatch_250_labels_str +cifar10_fixmatch_40_labels_str = FixmatchLabelEfficiency.cifar10_fixmatch_40_labels_str ``` ::: {.callout-example title="FixMatch on CIFAR-10"} @@ -1439,44 +1539,63 @@ To illustrate this economic transformation, consider a company building ten spec # └───────────────────────────────────────────────────────────────────────────── from mlsys.formatting import fmt, check -# --- Inputs (cost scenario: 10 classification tasks) --- -cost_scratch_per_task_value = 1000 # GPU-hrs per task -n_tasks_value = 10 # number of tasks -cost_pretrain_value = 10000 # GPU-hrs (one-time) -cost_finetune_value = 50 # GPU-hrs per task +class FoundationCostAmortization: + """Foundation model amortization: 10 tasks, 100× label reduction, 20× marginal compute drop.""" -labels_per_task_scratch = 100_000 # labels needed from scratch -cost_per_label = 1 # $/label -labels_per_task_finetune = 1_000 # labels for fine-tuning + # ┌── 1. 
PARAMETERS (Inputs) ────────────────────────────────────────────── + cost_scratch_per_task_value = 1000 # GPU-hrs per task + n_tasks_value = 10 # number of tasks + cost_pretrain_value = 10000 # GPU-hrs (one-time) + cost_finetune_value = 50 # GPU-hrs per task + labels_per_task_scratch = 100_000 # labels from scratch + cost_per_label = 1 # $/label + labels_per_task_finetune = 1_000 # labels for fine-tuning -# --- Process (compute total costs and reductions) --- -cost_scratch_total_value = cost_scratch_per_task_value * n_tasks_value -cost_foundation_total_value = cost_pretrain_value + (cost_finetune_value * n_tasks_value) + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + cost_scratch_total_value = cost_scratch_per_task_value * n_tasks_value + cost_foundation_total_value = cost_pretrain_value + (cost_finetune_value * n_tasks_value) -label_cost_scratch_total = labels_per_task_scratch * cost_per_label * n_tasks_value -label_cost_finetune_total = labels_per_task_finetune * cost_per_label * n_tasks_value + label_cost_scratch_total = labels_per_task_scratch * cost_per_label * n_tasks_value + label_cost_finetune_total = labels_per_task_finetune * cost_per_label * n_tasks_value -label_cost_reduction = label_cost_scratch_total / label_cost_finetune_total -marginal_compute_reduction = cost_scratch_per_task_value / cost_finetune_value -crossover_tasks_value = cost_pretrain_value / (cost_scratch_per_task_value - cost_finetune_value) + label_cost_reduction = label_cost_scratch_total / label_cost_finetune_total + marginal_compute_reduction = cost_scratch_per_task_value / cost_finetune_value + crossover_tasks_value = cost_pretrain_value / (cost_scratch_per_task_value - cost_finetune_value) -# --- Outputs (formatted strings for prose) --- -total_a_hrs_str = f"{cost_scratch_total_value:,}" # e.g. "10,000" -total_b_hrs_str = f"{cost_foundation_total_value:,}" # e.g. "10,500" + # ┌── 3. 
INVARIANTS (Guardrails) ────────────────────────────────────────── + check(label_cost_reduction > 1.0, "Fine-tuning must require fewer labels than scratch training.") + check(marginal_compute_reduction > 1.0, "Fine-tuning marginal compute must be less than scratch training.") + check(crossover_tasks_value > 0, "Crossover must be a positive number of tasks.") -labels_per_task_scratch_str = fmt(labels_per_task_scratch, precision=0, commas=True) # e.g. "100,000" -label_cost_scratch_total_str = f"${label_cost_scratch_total / 1_000_000:.0f}M" # e.g. "$1M" -cost_scratch_per_task_str = fmt(cost_scratch_per_task_value, precision=0, commas=True) # e.g. "1,000" -cost_scratch_total_str = fmt(cost_scratch_total_value, precision=0, commas=True) # e.g. "10,000" + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + total_a_hrs_str = f"{cost_scratch_total_value:,}" + total_b_hrs_str = f"{cost_foundation_total_value:,}" + labels_per_task_scratch_str = fmt(labels_per_task_scratch, precision=0, commas=True) + label_cost_scratch_total_str = f"${label_cost_scratch_total / 1_000_000:.0f}M" + cost_scratch_per_task_str = fmt(cost_scratch_per_task_value, precision=0, commas=True) + cost_scratch_total_str = fmt(cost_scratch_total_value, precision=0, commas=True) + labels_per_task_finetune_str = fmt(labels_per_task_finetune, precision=0, commas=True) + label_cost_finetune_total_str = f"${label_cost_finetune_total / 1_000:.0f}K" + cost_finetune_value_str = fmt(cost_finetune_value, precision=0, commas=True) + cost_pretrain_value_str = fmt(cost_pretrain_value, precision=0, commas=True) + marginal_compute_reduction_str = fmt(marginal_compute_reduction, precision=0, commas=False) + crossover_tasks_str = fmt(crossover_tasks_value, precision=0, commas=False) + label_cost_drop_str = fmt(label_cost_reduction, precision=0, commas=False) -labels_per_task_finetune_str = fmt(labels_per_task_finetune, precision=0, commas=True) # e.g. 
"1,000" -label_cost_finetune_total_str = f"${label_cost_finetune_total / 1_000:.0f}K" # e.g. "$10K" -cost_finetune_value_str = fmt(cost_finetune_value, precision=0, commas=True) # e.g. "50" -cost_pretrain_value_str = fmt(cost_pretrain_value, precision=0, commas=True) # e.g. "10,000" -marginal_compute_reduction_str = fmt(marginal_compute_reduction, precision=0, commas=False) # e.g. "20" -crossover_tasks_str = fmt(crossover_tasks_value, precision=0, commas=False) # e.g. "11" - -label_cost_drop_str = fmt(label_cost_reduction, precision=0, commas=False) # e.g. "100" +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +total_a_hrs_str = FoundationCostAmortization.total_a_hrs_str +total_b_hrs_str = FoundationCostAmortization.total_b_hrs_str +labels_per_task_scratch_str = FoundationCostAmortization.labels_per_task_scratch_str +label_cost_scratch_total_str = FoundationCostAmortization.label_cost_scratch_total_str +cost_scratch_per_task_str = FoundationCostAmortization.cost_scratch_per_task_str +cost_scratch_total_str = FoundationCostAmortization.cost_scratch_total_str +labels_per_task_finetune_str = FoundationCostAmortization.labels_per_task_finetune_str +label_cost_finetune_total_str = FoundationCostAmortization.label_cost_finetune_total_str +cost_finetune_value_str = FoundationCostAmortization.cost_finetune_value_str +cost_pretrain_value_str = FoundationCostAmortization.cost_pretrain_value_str +marginal_compute_reduction_str = FoundationCostAmortization.marginal_compute_reduction_str +crossover_tasks_str = FoundationCostAmortization.crossover_tasks_str +label_cost_drop_str = FoundationCostAmortization.label_cost_drop_str ``` Training each classifier from scratch would require substantial investment in both labeling and compute. With ten tasks each needing `{python} labels_per_task_scratch_str` labels at \$1 per label, the total labeling cost reaches **`{python} label_cost_scratch_total_str`**. 
The compute burden amounts to `{python} cost_scratch_total_str` GPU-hours across all tasks, with each requiring its own data collection effort. From start to finish, each task takes 6–12 months to complete. @@ -1504,19 +1623,26 @@ This explains *why* the fine-tuning paradigm dominates production ML. The pre-tr # │ Exports: total_a_hrs_str, total_b_hrs_str # └───────────────────────────────────────────────────────────────────────────── -# --- Inputs (same as foundation-cost-calc) --- -cost_scratch_per_task_value = 1000 # GPU-hrs per task -n_tasks_value = 10 # number of tasks -cost_pretrain_value = 10000 # GPU-hrs (one-time) -cost_finetune_value = 50 # GPU-hrs per task +class FoundationAmortizationData: + """Figure data: scratch vs. foundation model GPU-hours for 10 tasks.""" -# --- Process --- -cost_scratch_total_value = cost_scratch_per_task_value * n_tasks_value -cost_foundation_total_value = cost_pretrain_value + (cost_finetune_value * n_tasks_value) + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + cost_scratch_per_task_value = 1000 # GPU-hrs per task + n_tasks_value = 10 # number of tasks + cost_pretrain_value = 10000 # GPU-hrs (one-time) + cost_finetune_value = 50 # GPU-hrs per task -# --- Outputs (formatted strings for figure annotations) --- -total_a_hrs_str = f"{cost_scratch_total_value:,}" # e.g. "10,000" -total_b_hrs_str = f"{cost_foundation_total_value:,}" # e.g. "10,500" + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + cost_scratch_total_value = cost_scratch_per_task_value * n_tasks_value + cost_foundation_total_value = cost_pretrain_value + (cost_finetune_value * n_tasks_value) + + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + total_a_hrs_str = f"{cost_scratch_total_value:,}" + total_b_hrs_str = f"{cost_foundation_total_value:,}" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +total_a_hrs_str = FoundationAmortizationData.total_a_hrs_str +total_b_hrs_str = FoundationAmortizationData.total_b_hrs_str ``` Contrast the two bar charts in @fig-amortization-comparison to see this cost structure in action. Training from scratch (left) incurs the full cost for each task independently. The foundation model approach (right) pays a large upfront pre-training cost but then fine-tunes each task at a fraction of the per-task cost. @@ -1958,55 +2084,81 @@ Here $T_{selection}$ is the time spent scoring the pool and $T_{train}$ is the c # └───────────────────────────────────────────────────────────────────────────── from mlsys.formatting import fmt, check -# --- Inputs (1M image coreset scenario) --- -n_images_value = 1_000_000 # total images -n_coreset_value = 100_000 # 10% coreset -n_epochs_value = 100 # training epochs -resnet50_time_per_image_value = 0.01 # sec/image (full model) -resnet18_time_per_image_value = 0.002 # sec/image (proxy) -trap_sel_hrs_value = 50 # hrs for 7B model scoring +class SelectionInequalityCalc: + """1M image scenario: proxy scoring (0.6 hrs) preserves 90% compute savings vs full-model scoring.""" -# --- Process (compare options A, B, and trap scenario) --- -score_a_sec_value = n_images_value * resnet50_time_per_image_value -train_a_sec_value = n_coreset_value * n_epochs_value * resnet50_time_per_image_value -total_a_sec_value = score_a_sec_value + train_a_sec_value -total_a_hrs_value = total_a_sec_value / SEC_PER_HOUR + # ┌── 1. 
PARAMETERS (Inputs) ────────────────────────────────────────────── + n_images_value = 1_000_000 # total images + n_coreset_value = 100_000 # 10% coreset + n_epochs_value = 100 # training epochs + resnet50_time_per_image_value = 0.01 # sec/image (full model) + resnet18_time_per_image_value = 0.002 # sec/image (proxy) + trap_sel_hrs_value = 50 # hrs for 7B model scoring -score_b_sec_value = n_images_value * resnet18_time_per_image_value -train_b_sec_value = train_a_sec_value # same training time -total_b_sec_value = score_b_sec_value + train_b_sec_value -total_b_hrs_value = total_b_sec_value / SEC_PER_HOUR + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + score_a_sec_value = n_images_value * resnet50_time_per_image_value + train_a_sec_value = n_coreset_value * n_epochs_value * resnet50_time_per_image_value + total_a_sec_value = score_a_sec_value + train_a_sec_value + total_a_hrs_value = total_a_sec_value / SEC_PER_HOUR -baseline_sec_value = n_images_value * n_epochs_value * resnet50_time_per_image_value -baseline_hrs_value = baseline_sec_value / SEC_PER_HOUR + score_b_sec_value = n_images_value * resnet18_time_per_image_value + train_b_sec_value = train_a_sec_value + total_b_sec_value = score_b_sec_value + train_b_sec_value + total_b_hrs_value = total_b_sec_value / SEC_PER_HOUR -savings_a_hrs_value = baseline_hrs_value - total_a_hrs_value -savings_a_pct_value = savings_a_hrs_value / baseline_hrs_value * 100 -savings_b_hrs_value = baseline_hrs_value - total_b_hrs_value -savings_b_pct_value = savings_b_hrs_value / baseline_hrs_value * 100 -b_beats_a_hrs_value = total_a_hrs_value - total_b_hrs_value + baseline_sec_value = n_images_value * n_epochs_value * resnet50_time_per_image_value + baseline_hrs_value = baseline_sec_value / SEC_PER_HOUR -trap_total_hrs_value = trap_sel_hrs_value + train_a_sec_value / SEC_PER_HOUR -trap_overhead_pct_value = trap_sel_hrs_value / (baseline_hrs_value - trap_total_hrs_value) * 100 + savings_a_hrs_value = 
baseline_hrs_value - total_a_hrs_value + savings_a_pct_value = savings_a_hrs_value / baseline_hrs_value * 100 + savings_b_hrs_value = baseline_hrs_value - total_b_hrs_value + savings_b_pct_value = savings_b_hrs_value / baseline_hrs_value * 100 + b_beats_a_hrs_value = total_a_hrs_value - total_b_hrs_value -# --- Outputs (formatted strings for prose) --- -score_a_str = fmt(score_a_sec_value, precision=0, commas=True) # e.g. "10,000" -score_a_hrs_str = fmt(score_a_sec_value/SEC_PER_HOUR, precision=1, commas=False) # e.g. "2.8" -train_a_str = fmt(train_a_sec_value, precision=0, commas=True) # e.g. "100,000" -train_a_hrs_str = fmt(train_a_sec_value/SEC_PER_HOUR, precision=1, commas=False) # e.g. "27.8" -total_a_hrs_str = fmt(total_a_hrs_value, precision=1, commas=False) # e.g. "30.6" -score_b_str = fmt(score_b_sec_value, precision=0, commas=True) # e.g. "2,000" -score_b_hrs_str = fmt(score_b_sec_value/SEC_PER_HOUR, precision=1, commas=False) # e.g. "0.6" -total_b_hrs_str = fmt(total_b_hrs_value, precision=1, commas=False) # e.g. "28.3" -baseline_str = fmt(baseline_sec_value, precision=0, commas=True) # e.g. "1,000,000" -baseline_hrs_str = fmt(baseline_hrs_value, precision=0, commas=False) # e.g. "278" -savings_a_str = fmt(savings_a_hrs_value, precision=0, commas=False) # e.g. "247" -savings_a_pct_str = fmt(savings_a_pct_value, precision=0, commas=False) # e.g. "89" -savings_b_str = fmt(savings_b_hrs_value, precision=0, commas=False) # e.g. "250" -savings_b_pct_str = fmt(savings_b_pct_value, precision=0, commas=False) # e.g. "90" -b_beats_a_str = fmt(b_beats_a_hrs_value, precision=1, commas=False) # e.g. "2.2" -trap_total_str = fmt(trap_total_hrs_value, precision=1, commas=False) # e.g. "77.8" -trap_pct_str = fmt(trap_overhead_pct_value, precision=0, commas=False) # e.g. "25" + trap_total_hrs_value = trap_sel_hrs_value + train_a_sec_value / SEC_PER_HOUR + trap_overhead_pct_value = trap_sel_hrs_value / (baseline_hrs_value - trap_total_hrs_value) * 100 + + # ┌── 3. 
INVARIANTS (Guardrails) ────────────────────────────────────────── + check(savings_b_pct_value > savings_a_pct_value, "Proxy selection must outperform full-model selection.") + check(total_b_hrs_value < baseline_hrs_value, "Selection + subset training must beat full training.") + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + score_a_str = fmt(score_a_sec_value, precision=0, commas=True) + score_a_hrs_str = fmt(score_a_sec_value / SEC_PER_HOUR, precision=1, commas=False) + train_a_str = fmt(train_a_sec_value, precision=0, commas=True) + train_a_hrs_str = fmt(train_a_sec_value / SEC_PER_HOUR, precision=1, commas=False) + total_a_hrs_str = fmt(total_a_hrs_value, precision=1, commas=False) + score_b_str = fmt(score_b_sec_value, precision=0, commas=True) + score_b_hrs_str = fmt(score_b_sec_value / SEC_PER_HOUR, precision=1, commas=False) + total_b_hrs_str = fmt(total_b_hrs_value, precision=1, commas=False) + baseline_str = fmt(baseline_sec_value, precision=0, commas=True) + baseline_hrs_str = fmt(baseline_hrs_value, precision=0, commas=False) + savings_a_str = fmt(savings_a_hrs_value, precision=0, commas=False) + savings_a_pct_str = fmt(savings_a_pct_value, precision=0, commas=False) + savings_b_str = fmt(savings_b_hrs_value, precision=0, commas=False) + savings_b_pct_str = fmt(savings_b_pct_value, precision=0, commas=False) + b_beats_a_str = fmt(b_beats_a_hrs_value, precision=1, commas=False) + trap_total_str = fmt(trap_total_hrs_value, precision=1, commas=False) + trap_pct_str = fmt(trap_overhead_pct_value, precision=0, commas=False) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +score_a_str = SelectionInequalityCalc.score_a_str +score_a_hrs_str = SelectionInequalityCalc.score_a_hrs_str +train_a_str = SelectionInequalityCalc.train_a_str +train_a_hrs_str = SelectionInequalityCalc.train_a_hrs_str +total_a_hrs_str = SelectionInequalityCalc.total_a_hrs_str +score_b_str = 
SelectionInequalityCalc.score_b_str +score_b_hrs_str = SelectionInequalityCalc.score_b_hrs_str +total_b_hrs_str = SelectionInequalityCalc.total_b_hrs_str +baseline_str = SelectionInequalityCalc.baseline_str +baseline_hrs_str = SelectionInequalityCalc.baseline_hrs_str +savings_a_str = SelectionInequalityCalc.savings_a_str +savings_a_pct_str = SelectionInequalityCalc.savings_a_pct_str +savings_b_str = SelectionInequalityCalc.savings_b_str +savings_b_pct_str = SelectionInequalityCalc.savings_b_pct_str +b_beats_a_str = SelectionInequalityCalc.b_beats_a_str +trap_total_str = SelectionInequalityCalc.trap_total_str +trap_pct_str = SelectionInequalityCalc.trap_pct_str ``` ::: {.callout-example title="Selection Inequality in Practice"} @@ -2059,38 +2211,57 @@ The following analysis formalizes the 10% heuristic as *the selection inequality # └───────────────────────────────────────────────────────────────────────────── from mlsys.formatting import fmt, check -# --- Inputs (normalized epoch costs) --- -n_epochs_full = 100 # baseline epochs -subset_fraction = 0.1 # keep 10% -cost_selection_full = 1 # 1 epoch equivalent -proxy_factor = 0.1 # proxy is 10x faster +class SelectionInequalityMath: + """Epoch-normalized selection inequality: one-shot (9× speedup) vs. iterative (slower than baseline).""" -# --- Process (compute costs for different strategies) --- -n_epochs_subset = n_epochs_full * subset_fraction -cost_total_efficient = cost_selection_full + n_epochs_subset -speedup_efficient = n_epochs_full / cost_total_efficient + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + n_epochs_full = 100 # baseline epochs + subset_fraction = 0.1 # keep 10% + cost_selection_full = 1 # 1 epoch equivalent + proxy_factor = 0.1 # proxy is 10x faster -cost_selection_iterative = n_epochs_full * 1 # selection every epoch -cost_total_iterative = cost_selection_iterative + n_epochs_subset + # ┌── 2. 
CALCULATION (The Physics) ──────────────────────────────────────── + n_epochs_subset = n_epochs_full * subset_fraction + cost_total_efficient = cost_selection_full + n_epochs_subset + speedup_efficient = n_epochs_full / cost_total_efficient -cost_selection_proxy = cost_selection_full * proxy_factor -cost_total_proxy = cost_selection_proxy + n_epochs_subset + cost_selection_iterative = n_epochs_full * 1 # selection every epoch + cost_total_iterative = cost_selection_iterative + n_epochs_subset -# --- Outputs (formatted strings for prose) --- -n_epochs_full_str = fmt(n_epochs_full, precision=0, commas=False) # e.g. "100" -subset_fraction_pct_str = fmt(subset_fraction * 100, precision=0, commas=False) # e.g. "10" -cost_selection_full_str = fmt(cost_selection_full, precision=0, commas=False) # e.g. "1" -n_epochs_subset_str = fmt(n_epochs_subset, precision=0, commas=False) # e.g. "10" -cost_total_efficient_str = fmt(cost_total_efficient, precision=0, commas=False) # e.g. "11" -speedup_efficient_str = fmt(speedup_efficient, precision=0, commas=False) # e.g. "9" + cost_selection_proxy = cost_selection_full * proxy_factor + cost_total_proxy = cost_selection_proxy + n_epochs_subset -cost_selection_iterative_str = fmt(cost_selection_iterative, precision=0, commas=False) # e.g. "100" -cost_total_iterative_str = fmt(cost_total_iterative, precision=0, commas=False) # e.g. "110" + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(speedup_efficient > 1.0, "One-shot selection must yield positive speedup.") + check(cost_total_iterative > n_epochs_full, "Iterative selection must be slower than baseline.") -proxy_factor_inv_str = fmt(1/proxy_factor, precision=0, commas=False) # e.g. "10" -cost_selection_proxy_str = fmt(cost_selection_proxy, precision=1, commas=False) # e.g. "0.1" -cost_total_proxy_str = fmt(cost_total_proxy, precision=1, commas=False) # e.g. "10.1" -subset_fraction_str = fmt(subset_fraction, precision=1, commas=False) # e.g. 
"0.1" + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + n_epochs_full_str = fmt(n_epochs_full, precision=0, commas=False) + subset_fraction_pct_str = fmt(subset_fraction * 100, precision=0, commas=False) + cost_selection_full_str = fmt(cost_selection_full, precision=0, commas=False) + n_epochs_subset_str = fmt(n_epochs_subset, precision=0, commas=False) + cost_total_efficient_str = fmt(cost_total_efficient, precision=0, commas=False) + speedup_efficient_str = fmt(speedup_efficient, precision=0, commas=False) + cost_selection_iterative_str = fmt(cost_selection_iterative, precision=0, commas=False) + cost_total_iterative_str = fmt(cost_total_iterative, precision=0, commas=False) + proxy_factor_inv_str = fmt(1 / proxy_factor, precision=0, commas=False) + cost_selection_proxy_str = fmt(cost_selection_proxy, precision=1, commas=False) + cost_total_proxy_str = fmt(cost_total_proxy, precision=1, commas=False) + subset_fraction_str = fmt(subset_fraction, precision=1, commas=False) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +n_epochs_full_str = SelectionInequalityMath.n_epochs_full_str +subset_fraction_pct_str = SelectionInequalityMath.subset_fraction_pct_str +cost_selection_full_str = SelectionInequalityMath.cost_selection_full_str +n_epochs_subset_str = SelectionInequalityMath.n_epochs_subset_str +cost_total_efficient_str = SelectionInequalityMath.cost_total_efficient_str +speedup_efficient_str = SelectionInequalityMath.speedup_efficient_str +cost_selection_iterative_str = SelectionInequalityMath.cost_selection_iterative_str +cost_total_iterative_str = SelectionInequalityMath.cost_total_iterative_str +proxy_factor_inv_str = SelectionInequalityMath.proxy_factor_inv_str +cost_selection_proxy_str = SelectionInequalityMath.cost_selection_proxy_str +cost_total_proxy_str = SelectionInequalityMath.cost_total_proxy_str +subset_fraction_str = SelectionInequalityMath.subset_fraction_str ``` ::: 
{.callout-notebook title="The Selection Inequality"} @@ -2233,38 +2404,58 @@ If $R > 1$ (data pipeline is the bottleneck), set echo factor $e \leq R$ to full # └───────────────────────────────────────────────────────────────────────────── from mlsys.formatting import fmt, check -# --- Inputs (ImageNet training with heavy augmentation) --- -pipeline_throughput_value = 300 # images/sec (CPU-bound) -gpu_throughput_value = 800 # images/sec (GPU capacity) -n_epochs_echo_value = 90 # standard ImageNet epochs -imagenet_size_value = 1_280_000 # ~1.28M images -echo_factor_value = 2 # repeat each batch 2x +class DataEchoingRoi: + """ImageNet heavy augmentation: echo factor 2 cuts training from 107 hrs to 53 hrs.""" -# --- Process (compute throughputs and training times) --- -ratio_r_value = gpu_throughput_value / pipeline_throughput_value -gpu_idle_pct_value = (1 - pipeline_throughput_value / gpu_throughput_value) * 100 + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + pipeline_throughput_value = 300 # images/sec (CPU-bound) + gpu_throughput_value = 800 # images/sec (GPU capacity) + n_epochs_echo_value = 90 # standard ImageNet epochs + imagenet_size_value = 1_280_000 # ~1.28M images + echo_factor_value = 2 # repeat each batch 2x -no_echo_throughput_value = pipeline_throughput_value -no_echo_sec_value = n_epochs_echo_value * imagenet_size_value / no_echo_throughput_value -no_echo_hrs_value = no_echo_sec_value / SEC_PER_HOUR -gpu_util_no_echo_value = pipeline_throughput_value / gpu_throughput_value * 100 + # ┌── 2. 
CALCULATION (The Physics) ──────────────────────────────────────── + ratio_r_value = gpu_throughput_value / pipeline_throughput_value + gpu_idle_pct_value = (1 - pipeline_throughput_value / gpu_throughput_value) * 100 -echo_throughput_value = pipeline_throughput_value * echo_factor_value -echo_sec_value = n_epochs_echo_value * imagenet_size_value / echo_throughput_value -echo_hrs_value = echo_sec_value / SEC_PER_HOUR + no_echo_throughput_value = pipeline_throughput_value + no_echo_sec_value = n_epochs_echo_value * imagenet_size_value / no_echo_throughput_value + no_echo_hrs_value = no_echo_sec_value / SEC_PER_HOUR + gpu_util_no_echo_value = pipeline_throughput_value / gpu_throughput_value * 100 -# --- Outputs (formatted strings for prose) --- -pipeline_throughput_str = fmt(pipeline_throughput_value, precision=0, commas=False) # e.g. "300" -gpu_throughput_str = fmt(gpu_throughput_value, precision=0, commas=False) # e.g. "800" -pipeline_ratio_str = fmt(ratio_r_value, precision=2, commas=False) # e.g. "2.67" -idle_pct_str = fmt(gpu_idle_pct_value, precision=0, commas=False) # e.g. "63" -no_echo_sec_str = fmt(no_echo_sec_value, precision=0, commas=True) # e.g. "384,000" -no_echo_hrs_str = fmt(no_echo_hrs_value, precision=0, commas=False) # e.g. "107" -gpu_util_str = fmt(gpu_util_no_echo_value, precision=0, commas=False) # e.g. "38" -echo_sec_str = fmt(echo_sec_value, precision=0, commas=True) # e.g. "192,000" -echo_hrs_str = fmt(echo_hrs_value, precision=0, commas=False) # e.g. "53" -echo_factor_str = fmt(echo_factor_value, precision=0, commas=False) # e.g. "2" -effective_throughput_str = fmt(echo_throughput_value, precision=0, commas=False) # e.g. "600" + echo_throughput_value = pipeline_throughput_value * echo_factor_value + echo_sec_value = n_epochs_echo_value * imagenet_size_value / echo_throughput_value + echo_hrs_value = echo_sec_value / SEC_PER_HOUR + + # ┌── 3. 
INVARIANTS (Guardrails) ────────────────────────────────────────── + check(ratio_r_value > 1.0, "Pipeline must be slower than GPU for echoing to help.") + check(echo_hrs_value < no_echo_hrs_value, "Echoing must reduce training time.") + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + pipeline_throughput_str = fmt(pipeline_throughput_value, precision=0, commas=False) + gpu_throughput_str = fmt(gpu_throughput_value, precision=0, commas=False) + pipeline_ratio_str = fmt(ratio_r_value, precision=2, commas=False) + idle_pct_str = fmt(gpu_idle_pct_value, precision=0, commas=False) + no_echo_sec_str = fmt(no_echo_sec_value, precision=0, commas=True) + no_echo_hrs_str = fmt(no_echo_hrs_value, precision=0, commas=False) + gpu_util_str = fmt(gpu_util_no_echo_value, precision=0, commas=False) + echo_sec_str = fmt(echo_sec_value, precision=0, commas=True) + echo_hrs_str = fmt(echo_hrs_value, precision=0, commas=False) + echo_factor_str = fmt(echo_factor_value, precision=0, commas=False) + effective_throughput_str = fmt(echo_throughput_value, precision=0, commas=False) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +pipeline_throughput_str = DataEchoingRoi.pipeline_throughput_str +gpu_throughput_str = DataEchoingRoi.gpu_throughput_str +pipeline_ratio_str = DataEchoingRoi.pipeline_ratio_str +idle_pct_str = DataEchoingRoi.idle_pct_str +no_echo_sec_str = DataEchoingRoi.no_echo_sec_str +no_echo_hrs_str = DataEchoingRoi.no_echo_hrs_str +gpu_util_str = DataEchoingRoi.gpu_util_str +echo_sec_str = DataEchoingRoi.echo_sec_str +echo_hrs_str = DataEchoingRoi.echo_hrs_str +echo_factor_str = DataEchoingRoi.echo_factor_str +effective_throughput_str = DataEchoingRoi.effective_throughput_str ``` ::: {.callout-example title="Worked Example: Data Echoing ROI"} @@ -2350,38 +2541,57 @@ For a concrete example, consider training a vision model: # └───────────────────────────────────────────────────────────────────────────── from 
mlsys.formatting import fmt, check -# --- Inputs (ImageNet training cost scenario) --- -c_raw_value = 50000 # $ for licensed dataset -n_labels_value = 1_200_000 # images to label -cost_per_label_value = 0.05 # $/label (crowd) -c_store_value = 200 # $ storage (150GB × 12mo) -c_train_value = 25000 # $ GPU compute +class CostBreakdown: + """ImageNet-scale training cost breakdown: data costs (~81%) dominate compute (~19%).""" -# --- Process (compute totals and percentages) --- -c_label_value = n_labels_value * cost_per_label_value -c_total_value = c_raw_value + c_label_value + c_store_value + c_train_value + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + c_raw_value = 50000 # $ for licensed dataset + n_labels_value = 1_200_000 # images to label + cost_per_label_value = 0.05 # $/label (crowd) + c_store_value = 200 # $ storage (150GB × 12mo) + c_train_value = 25000 # $ GPU compute + storage_gb = 150 + storage_months = 12 + train_epochs = 100 + train_gpus = 8 + train_hours = 24 -p_data_value = (c_raw_value + c_label_value + c_store_value) / c_total_value * 100 -p_compute_value = c_train_value / c_total_value * 100 + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + c_label_value = n_labels_value * cost_per_label_value + c_total_value = c_raw_value + c_label_value + c_store_value + c_train_value -# --- Outputs (formatted strings for table) --- -c_raw_str = f"${c_raw_value:,}" # e.g. "$50,000" -c_label_str = f"${c_label_value:,.0f}" # e.g. "$60,000" -c_store_str = f"${c_store_value}" # e.g. "$200" -c_train_str = f"${c_train_value:,}" # e.g. "$25,000" -c_total_str = f"${c_total_value:,.0f}" # e.g. "$135,200" -p_data_str = f"{p_data_value:.0f}%" # e.g. "81%" -p_compute_str = f"{p_compute_value:.0f}%" # e.g. 
"19%" + p_data_value = (c_raw_value + c_label_value + c_store_value) / c_total_value * 100 + p_compute_value = c_train_value / c_total_value * 100 -n_labels_str = fmt(n_labels_value / MILLION, precision=1) + "M" # e.g. "1.2M" -cb_cost_per_label_str = fmt(cost_per_label_value, precision=2, commas=False) # e.g. "0.05" -storage_gb = 150 # GB stored -storage_months = 12 # months -storage_str = f"{storage_gb} GB × {storage_months} months" # e.g. "150 GB × 12 months" -train_epochs = 100 # training epochs -train_gpus = 8 # A100 GPUs -train_hours = 24 # hours -train_desc_str = f"{train_epochs} epochs × {train_gpus} A100s × {train_hours} h" + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(p_data_value > p_compute_value, "Data costs must dominate compute costs in this scenario.") + check(abs(p_data_value + p_compute_value - 100) < 0.1, "Data + compute percentages must sum to 100.") + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + c_raw_str = f"${c_raw_value:,}" + c_label_str = f"${c_label_value:,.0f}" + c_store_str = f"${c_store_value}" + c_train_str = f"${c_train_value:,}" + c_total_str = f"${c_total_value:,.0f}" + p_data_str = f"{p_data_value:.0f}%" + p_compute_str = f"{p_compute_value:.0f}%" + n_labels_str = fmt(n_labels_value / MILLION, precision=1) + "M" + cb_cost_per_label_str = fmt(cost_per_label_value, precision=2, commas=False) + storage_str = f"{storage_gb} GB × {storage_months} months" + train_desc_str = f"{train_epochs} epochs × {train_gpus} A100s × {train_hours} h" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +c_raw_str = CostBreakdown.c_raw_str +c_label_str = CostBreakdown.c_label_str +c_store_str = CostBreakdown.c_store_str +c_train_str = CostBreakdown.c_train_str +c_total_str = CostBreakdown.c_total_str +p_data_str = CostBreakdown.p_data_str +p_compute_str = CostBreakdown.p_compute_str +n_labels_str = CostBreakdown.n_labels_str +cb_cost_per_label_str = 
CostBreakdown.cb_cost_per_label_str +storage_str = CostBreakdown.storage_str +train_desc_str = CostBreakdown.train_desc_str ``` ::: {.callout-example title="Cost Breakdown: ImageNet-Scale Training"} @@ -2441,38 +2651,59 @@ ROI calculations assume that techniques deliver their promised benefits, but act # └───────────────────────────────────────────────────────────────────────────── from mlsys.formatting import fmt, check -# --- Inputs (active learning cost scenario) --- -cost_label = 10 # $/label -n_initial = 1000 # initial labeled set -n_queries_per_round = 100 # samples per round -cost_inference = 50 # $/round (scoring pool) -n_random = 5000 # random sampling needs -n_active = 2000 # active learning needs -n_rounds = 10 # AL query rounds +class BreakevenCalc: + """Active learning break-even: 2K labels + $500 inference achieves same accuracy as 5K random labels.""" -# --- Process (compute costs and ROI) --- -cost_random_total = n_random * cost_label -cost_active_label = n_active * cost_label -cost_active_inference = n_rounds * cost_inference -cost_active_total = cost_active_label + cost_active_inference + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + cost_label = 10 # $/label + n_initial = 1000 # initial labeled set + n_queries_per_round = 100 # samples per round + cost_inference = 50 # $/round (scoring pool) + n_random = 5000 # random sampling needs + n_active = 2000 # active learning needs + n_rounds = 10 # AL query rounds -roi_pct = (cost_random_total - cost_active_total) / cost_active_total * 100 + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + cost_random_total = n_random * cost_label + cost_active_label = n_active * cost_label + cost_active_inference = n_rounds * cost_inference + cost_active_total = cost_active_label + cost_active_inference -# --- Outputs (formatted strings for prose) --- -cost_label_str = fmt(cost_label, precision=0, commas=False) # e.g. 
"10" -n_initial_str = fmt(n_initial, precision=0, commas=True) # e.g. "1,000" -cost_initial_str = fmt(n_initial * cost_label, precision=0, commas=True) # e.g. "10,000" -n_queries_str = fmt(n_queries_per_round, precision=0, commas=False) # e.g. "100" -cost_inference_str = fmt(cost_inference, precision=0, commas=False) # e.g. "50" -be_n_random_str = fmt(n_random, precision=0, commas=True) # e.g. "5,000" -be_n_active_str = fmt(n_active, precision=0, commas=True) # e.g. "2,000" -n_rounds_str = fmt(n_rounds, precision=0, commas=False) # e.g. "10" + roi_pct = (cost_random_total - cost_active_total) / cost_active_total * 100 -cost_random_total_str = fmt(cost_random_total, precision=0, commas=True) # e.g. "50,000" -cost_active_label_str = fmt(cost_active_label, precision=0, commas=True) # e.g. "20,000" -cost_active_inference_str = fmt(cost_active_inference, precision=0, commas=True) # e.g. "500" -cost_active_total_str = fmt(cost_active_total, precision=0, commas=True) # e.g. "20,500" -roi_pct_str = fmt(roi_pct, precision=0, commas=False) # e.g. "144" + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(cost_random_total > cost_active_total, "Active learning must be cheaper than random sampling.") + check(roi_pct > 0, "ROI must be positive for active learning to be justified.") + + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + cost_label_str = fmt(cost_label, precision=0, commas=False) + n_initial_str = fmt(n_initial, precision=0, commas=True) + cost_initial_str = fmt(n_initial * cost_label, precision=0, commas=True) + n_queries_str = fmt(n_queries_per_round, precision=0, commas=False) + cost_inference_str = fmt(cost_inference, precision=0, commas=False) + be_n_random_str = fmt(n_random, precision=0, commas=True) + be_n_active_str = fmt(n_active, precision=0, commas=True) + n_rounds_str = fmt(n_rounds, precision=0, commas=False) + cost_random_total_str = fmt(cost_random_total, precision=0, commas=True) + cost_active_label_str = fmt(cost_active_label, precision=0, commas=True) + cost_active_inference_str = fmt(cost_active_inference, precision=0, commas=True) + cost_active_total_str = fmt(cost_active_total, precision=0, commas=True) + roi_pct_str = fmt(roi_pct, precision=0, commas=False) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +cost_label_str = BreakevenCalc.cost_label_str +n_initial_str = BreakevenCalc.n_initial_str +cost_initial_str = BreakevenCalc.cost_initial_str +n_queries_str = BreakevenCalc.n_queries_str +cost_inference_str = BreakevenCalc.cost_inference_str +be_n_random_str = BreakevenCalc.be_n_random_str +be_n_active_str = BreakevenCalc.be_n_active_str +n_rounds_str = BreakevenCalc.n_rounds_str +cost_random_total_str = BreakevenCalc.cost_random_total_str +cost_active_label_str = BreakevenCalc.cost_active_label_str +cost_active_inference_str = BreakevenCalc.cost_active_inference_str +cost_active_total_str = BreakevenCalc.cost_active_total_str +roi_pct_str = BreakevenCalc.roi_pct_str ``` Suppose labeling costs \$`{python} cost_label_str`/sample and active learning requires: @@ -2517,25 +2748,43 @@ $$ # └───────────────────────────────────────────────────────────────────────────── from mlsys.formatting import fmt, check -# --- Inputs (deduplication infrastructure scenario) 
--- -cost_build = 50000 # $ engineering time -cost_compute_once = 5000 # $ one-time MinHash compute -savings_per_run = 10000 # $ saved per training run +class DeduplicationAmortization: + """Deduplication pipeline ROI: negative at 1 run, highly profitable at 50 runs.""" -# --- Process (compute amortized ROI at different run counts) --- -cost_investment = cost_build + cost_compute_once -runs = [1, 5, 10, 50] -rois = [(r * savings_per_run - cost_investment) / cost_investment * 100 for r in runs] + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + cost_build = 50000 # $ engineering time + cost_compute_once = 5000 # $ one-time MinHash compute + savings_per_run = 10000 # $ saved per training run -# --- Outputs (formatted strings for tables) --- -cost_build_str = fmt(cost_build, precision=0, commas=True) # e.g. "50,000" -cost_compute_once_str = fmt(cost_compute_once, precision=0, commas=True) # e.g. "5,000" -savings_per_run_str = fmt(savings_per_run, precision=0, commas=True) # e.g. "10,000" + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + cost_investment = cost_build + cost_compute_once + runs = [1, 5, 10, 50] + # For-loop (not list comprehension) — class bodies cannot access class attrs in comprehension scopes + rois = [] + for _r in runs: + rois.append((_r * savings_per_run - cost_investment) / cost_investment * 100) -roi_1_str = fmt(rois[0], precision=0, commas=False) # e.g. "-82" -roi_5_str = fmt(rois[1], precision=0, commas=False) # e.g. "-9" -roi_10_str = fmt(rois[2], precision=0, commas=False) # e.g. "82" -roi_50_str = fmt(rois[3], precision=0, commas=False) # e.g. "809" + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(rois[0] < 0, "Single-run ROI must be negative (investment not yet recovered).") + check(rois[3] > 0, "50-run ROI must be positive (highly profitable).") + + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + cost_build_str = fmt(cost_build, precision=0, commas=True) + cost_compute_once_str = fmt(cost_compute_once, precision=0, commas=True) + savings_per_run_str = fmt(savings_per_run, precision=0, commas=True) + roi_1_str = fmt(rois[0], precision=0, commas=False) + roi_5_str = fmt(rois[1], precision=0, commas=False) + roi_10_str = fmt(rois[2], precision=0, commas=False) + roi_50_str = fmt(rois[3], precision=0, commas=False) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +cost_build_str = DeduplicationAmortization.cost_build_str +cost_compute_once_str = DeduplicationAmortization.cost_compute_once_str +savings_per_run_str = DeduplicationAmortization.savings_per_run_str +roi_1_str = DeduplicationAmortization.roi_1_str +roi_5_str = DeduplicationAmortization.roi_5_str +roi_10_str = DeduplicationAmortization.roi_10_str +roi_50_str = DeduplicationAmortization.roi_50_str ``` | **Component** | **Cost** | @@ -2652,21 +2901,36 @@ Several strategies mitigate this staleness problem, each with distinct overhead # │ Exports: t_embed_str, t_dedup_str, t_score_str, t_total_overhead_str # └───────────────────────────────────────────────────────────────────────────── -# --- Inputs (distributed selection timing on 8x A100) --- -t_embed_value = 20 # minutes (parallel) -t_dedup_value = 15 # minutes (distributed hash) -t_score_value = 30 # minutes (parallel proxy) -t_select_value = 2 # minutes (centralized) +class DistributedOverheadCalc: + """8× A100 cluster coreset selection: 67-minute total overhead for 10× training speedup.""" -# --- Process --- -t_total_overhead_value = t_embed_value + t_dedup_value + t_score_value + t_select_value + # ┌── 1. 
PARAMETERS (Inputs) ────────────────────────────────────────────── + t_embed_value = 20 # minutes (parallel) + t_dedup_value = 15 # minutes (distributed hash) + t_score_value = 30 # minutes (parallel proxy) + t_select_value = 2 # minutes (centralized) -# --- Outputs (formatted strings for prose) --- -t_embed_str = f"{t_embed_value} minutes" # e.g. "20 minutes" -t_dedup_str = f"{t_dedup_value} minutes" # e.g. "15 minutes" -t_score_str = f"{t_score_value} minutes" # e.g. "30 minutes" -t_select_str = f"{t_select_value} minutes" # e.g. "2 minutes" -t_total_overhead_str = f"{t_total_overhead_value} minutes" # e.g. "67 minutes" + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + t_total_overhead_value = t_embed_value + t_dedup_value + t_score_value + t_select_value + + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(t_total_overhead_value > 0, "Total overhead must be positive.") + check(t_score_value == max(t_embed_value, t_dedup_value, t_score_value, t_select_value), + "Scoring must be the dominant overhead phase.") + + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + t_embed_str = f"{t_embed_value} minutes" + t_dedup_str = f"{t_dedup_value} minutes" + t_score_str = f"{t_score_value} minutes" + t_select_str = f"{t_select_value} minutes" + t_total_overhead_str = f"{t_total_overhead_value} minutes" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +t_embed_str = DistributedOverheadCalc.t_embed_str +t_dedup_str = DistributedOverheadCalc.t_dedup_str +t_score_str = DistributedOverheadCalc.t_score_str +t_select_str = DistributedOverheadCalc.t_select_str +t_total_overhead_str = DistributedOverheadCalc.t_total_overhead_str ``` ::: {.callout-example title="Distributed Coreset Selection"} @@ -3066,141 +3330,184 @@ Data selection involves counterintuitive diminishing returns that contradict the # └───────────────────────────────────────────────────────────────────────────── from mlsys.formatting import fmt, check -# --- Fallacy 1: Diminishing returns (scaling law based) --- -# Approximate scaling: accuracy ~ log(data_size) -# Doubling data from 1M to 2M adds less than doubling from 100K to 200K -data_1m_value = 1_000_000 -data_10m_value = 10_000_000 -# Illustrative: 10x data yields ~3-5% accuracy gain at scale -acc_gain_10x_value = 4.0 -cost_10x_data_value = 9 # 9x more compute for 10x data (sublinear storage) -cost_efficiency_10x_value = acc_gain_10x_value / cost_10x_data_value +class FpScalingCalc: + """Quantitative backing for all Fallacies and Pitfalls in the F&P section.""" -# Curated vs raw comparison -curated_size_value = 100_000 -raw_size_value = 1_000_000 -curated_accuracy_value = 92.0 -raw_accuracy_value = 88.0 -curated_cost_ratio_value = raw_size_value / curated_size_value + # ┌── 1. 
PARAMETERS (Inputs) ────────────────────────────────────────────── + # Fallacy 1: Diminishing returns + data_1m_value = 1_000_000 + data_10m_value = 10_000_000 + acc_gain_10x_value = 4.0 + curated_size_value = 100_000 + raw_size_value = 1_000_000 + curated_accuracy_value = 92.0 + raw_accuracy_value = 88.0 -# --- Fallacy 2: Synthetic data / Model Collapse --- -# Research shows accuracy drops after training on model-generated data -synthetic_gen1_acc_value = 95.0 # First generation -synthetic_gen5_acc_value = 78.0 # After 5 generations (collapse) -synthetic_acc_drop_value = synthetic_gen1_acc_value - synthetic_gen5_acc_value -optimal_synthetic_mix_min_value = 50 -optimal_synthetic_mix_max_value = 80 + # Fallacy 2: Synthetic data / Model Collapse + synthetic_gen1_acc_value = 95.0 + synthetic_gen5_acc_value = 78.0 + optimal_synthetic_mix_min_value = 50 + optimal_synthetic_mix_max_value = 80 -# --- Fallacy 4: Scale economics --- -training_run_cost_value = 100_000_000 # $100M -efficiency_gain_pct_value = 10 -savings_value = training_run_cost_value * efficiency_gain_pct_value / 100 + # Fallacy 4: Scale economics + training_run_cost_value = 100_000_000 + efficiency_gain_pct_value = 10 -# --- Pitfall 1: Selection overhead --- -selection_time_bad_value = 10 # hours -training_time_value = 2 # hours for subset -full_training_time_value = 8 # hours for full dataset -selection_overhead_ratio_value = selection_time_bad_value / training_time_value -# Good scenario -selection_time_good_value = 0.5 # 30 minutes with proxy -selection_overhead_good_pct_value = selection_time_good_value / full_training_time_value * 100 + # Pitfall 1: Selection overhead + selection_time_bad_value = 10 # hours + training_time_value = 2 # hours for subset + full_training_time_value = 8 # hours for full dataset + selection_time_good_value = 0.5 # 30 minutes with proxy -# --- Pitfall 2: Rare class pruning --- -# Class imbalance scenario -total_samples_value = 1_000_000 -rare_class_pct_value = 0.1 # 0.1% of 
data -rare_class_count_value = int(total_samples_value * rare_class_pct_value / 100) -coreset_pct_value = 10 # Keep 10% -expected_rare_in_coreset_value = int(rare_class_count_value * coreset_pct_value / 100) -min_samples_threshold_value = 50 + # Pitfall 2: Rare class pruning + total_samples_value = 1_000_000 + rare_class_pct_value = 0.1 # 0.1% of data + coreset_pct_value = 10 # Keep 10% + min_samples_threshold_value = 150 -# --- Pitfall 3: Deduplication leakage --- -# Studies show 3-15% test set contamination in web-scraped data -test_contamination_pct_value = 8 -inflated_acc_value = 94.0 -true_acc_value = 89.0 -inflation_gap_value = inflated_acc_value - true_acc_value + # Pitfall 3: Deduplication leakage + test_contamination_pct_value = 8 + inflated_acc_value = 94.0 + true_acc_value = 89.0 -# --- Pitfall 4: Active learning latency --- -al_latency_days_value = 14 # 2 weeks for expert labels -model_drift_epochs_value = 10 -batch_size_small_value = 100 -batch_size_large_value = 1000 + # Pitfall 4: Active learning latency + al_latency_days_value = 14 + model_drift_epochs_value = 10 + batch_size_small_value = 100 + batch_size_large_value = 1000 -# --- Fallacy 5: Benchmark transfer --- -cifar10_coreset_pct_value = 50 -cifar10_acc_retained_value = 98 -imagenet_coreset_pct_value = 50 -imagenet_acc_retained_value = 95 -medical_coreset_pct_value = 50 -medical_acc_retained_value = 72 + # Fallacy 5: Benchmark transfer + cifar10_coreset_pct_value = 50 + cifar10_acc_retained_value = 98 + imagenet_acc_retained_value = 95 + medical_acc_retained_value = 72 -# --- Pitfall 5: Deployment metrics --- -coreset_size_pct_value = 10 -ppd_score_value = 0.95 -rare_class_acc_value = 45 -majority_class_acc_value = 97 + # Pitfall 5: Deployment metrics + coreset_size_pct_value = 10 + rare_class_acc_value = 45 + majority_class_acc_value = 97 -# --- Outputs (formatted strings) --- -# Fallacy 1 -data_1m_str = fmt(data_1m_value / MILLION, precision=0) + "M" -data_10m_str = fmt(data_10m_value / 
MILLION, precision=0) + "M" -acc_gain_10x_str = fmt(acc_gain_10x_value, precision=0, commas=False) -curated_size_str = fmt(curated_size_value / 1e3, precision=0) + "K" -raw_size_str = fmt(raw_size_value / MILLION, precision=0) + "M" -curated_accuracy_str = fmt(curated_accuracy_value, precision=0, commas=False) -raw_accuracy_str = fmt(raw_accuracy_value, precision=0, commas=False) -curated_cost_ratio_str = fmt(curated_cost_ratio_value, precision=0, commas=False) + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + curated_cost_ratio_value = raw_size_value / curated_size_value + synthetic_acc_drop_value = synthetic_gen1_acc_value - synthetic_gen5_acc_value + savings_value = training_run_cost_value * efficiency_gain_pct_value / 100 + selection_overhead_ratio_value = selection_time_bad_value / training_time_value + selection_overhead_good_pct_value = selection_time_good_value / full_training_time_value * 100 + rare_class_count_value = int(total_samples_value * rare_class_pct_value / 100) + expected_rare_in_coreset_value = int(rare_class_count_value * coreset_pct_value / 100) + inflation_gap_value = inflated_acc_value - true_acc_value -# Fallacy 2 -synthetic_gen1_acc_str = fmt(synthetic_gen1_acc_value, precision=0, commas=False) -synthetic_gen5_acc_str = fmt(synthetic_gen5_acc_value, precision=0, commas=False) -synthetic_acc_drop_str = fmt(synthetic_acc_drop_value, precision=0, commas=False) -optimal_synthetic_mix_str = f"{optimal_synthetic_mix_min_value}–{optimal_synthetic_mix_max_value}" + # ┌── 3. 
INVARIANTS (Guardrails) ────────────────────────────────────────── + check(curated_accuracy_value > raw_accuracy_value, + "Curated small dataset must outperform raw large dataset.") + check(synthetic_acc_drop_value > 0, "Model collapse must degrade accuracy.") + check(expected_rare_in_coreset_value < min_samples_threshold_value, + "Random coreset should drop rare class below minimum threshold.") -# Fallacy 4 -training_run_cost_str = fmt(training_run_cost_value / MILLION, precision=0) + "M" -efficiency_gain_pct_str = fmt(efficiency_gain_pct_value, precision=0, commas=False) -savings_str = fmt(savings_value / MILLION, precision=0) + "M" + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + # Fallacy 1 + data_1m_str = fmt(data_1m_value / MILLION, precision=0) + "M" + data_10m_str = fmt(data_10m_value / MILLION, precision=0) + "M" + acc_gain_10x_str = fmt(acc_gain_10x_value, precision=0, commas=False) + curated_size_str = fmt(curated_size_value / 1e3, precision=0) + "K" + raw_size_str = fmt(raw_size_value / MILLION, precision=0) + "M" + curated_accuracy_str = fmt(curated_accuracy_value, precision=0, commas=False) + raw_accuracy_str = fmt(raw_accuracy_value, precision=0, commas=False) + curated_cost_ratio_str = fmt(curated_cost_ratio_value, precision=0, commas=False) -# Pitfall 1 -selection_time_bad_str = fmt(selection_time_bad_value, precision=0, commas=False) -training_time_str = fmt(training_time_value, precision=0, commas=False) -full_training_time_str = fmt(full_training_time_value, precision=0, commas=False) -selection_overhead_ratio_str = fmt(selection_overhead_ratio_value, precision=0, commas=False) -selection_time_good_str = fmt(selection_time_good_value * 60, precision=0, commas=False) # in minutes -selection_overhead_good_pct_str = fmt(selection_overhead_good_pct_value, precision=0, commas=False) + # Fallacy 2 + synthetic_gen1_acc_str = fmt(synthetic_gen1_acc_value, precision=0, commas=False) + synthetic_gen5_acc_str = 
fmt(synthetic_gen5_acc_value, precision=0, commas=False) + synthetic_acc_drop_str = fmt(synthetic_acc_drop_value, precision=0, commas=False) + optimal_synthetic_mix_str = f"{optimal_synthetic_mix_min_value}–{optimal_synthetic_mix_max_value}" -# Pitfall 2 -total_samples_str = fmt(total_samples_value / MILLION, precision=0) + "M" -rare_class_pct_str = fmt(rare_class_pct_value, precision=1, commas=False) -rare_class_count_str = fmt(rare_class_count_value, precision=0, commas=True) -fp_coreset_pct_str = fmt(coreset_pct_value, precision=0, commas=False) -expected_rare_in_coreset_str = fmt(expected_rare_in_coreset_value, precision=0, commas=False) -min_samples_threshold_str = fmt(min_samples_threshold_value, precision=0, commas=False) + # Fallacy 4 + training_run_cost_str = fmt(training_run_cost_value / MILLION, precision=0) + "M" + efficiency_gain_pct_str = fmt(efficiency_gain_pct_value, precision=0, commas=False) + savings_str = fmt(savings_value / MILLION, precision=0) + "M" -# Pitfall 3 -test_contamination_pct_str = fmt(test_contamination_pct_value, precision=0, commas=False) -inflated_acc_str = fmt(inflated_acc_value, precision=0, commas=False) -true_acc_str = fmt(true_acc_value, precision=0, commas=False) -inflation_gap_str = fmt(inflation_gap_value, precision=0, commas=False) + # Pitfall 1 + selection_time_bad_str = fmt(selection_time_bad_value, precision=0, commas=False) + training_time_str = fmt(training_time_value, precision=0, commas=False) + full_training_time_str = fmt(full_training_time_value, precision=0, commas=False) + selection_overhead_ratio_str = fmt(selection_overhead_ratio_value, precision=0, commas=False) + selection_time_good_str = fmt(selection_time_good_value * 60, precision=0, commas=False) + selection_overhead_good_pct_str = fmt(selection_overhead_good_pct_value, precision=0, commas=False) -# Pitfall 4 -al_latency_days_str = fmt(al_latency_days_value, precision=0, commas=False) -model_drift_epochs_str = fmt(model_drift_epochs_value, 
precision=0, commas=False) -batch_size_small_str = fmt(batch_size_small_value, precision=0, commas=False) -batch_size_large_str = fmt(batch_size_large_value, precision=0, commas=True) + # Pitfall 2 + total_samples_str = fmt(total_samples_value / MILLION, precision=0) + "M" + rare_class_pct_str = fmt(rare_class_pct_value, precision=1, commas=False) + rare_class_count_str = fmt(rare_class_count_value, precision=0, commas=True) + fp_coreset_pct_str = fmt(coreset_pct_value, precision=0, commas=False) + expected_rare_in_coreset_str = fmt(expected_rare_in_coreset_value, precision=0, commas=False) + min_samples_threshold_str = fmt(min_samples_threshold_value, precision=0, commas=False) -# Fallacy 5 -cifar10_coreset_pct_str = fmt(cifar10_coreset_pct_value, precision=0, commas=False) -cifar10_acc_retained_str = fmt(cifar10_acc_retained_value, precision=0, commas=False) -imagenet_acc_retained_str = fmt(imagenet_acc_retained_value, precision=0, commas=False) -medical_acc_retained_str = fmt(medical_acc_retained_value, precision=0, commas=False) + # Pitfall 3 + test_contamination_pct_str = fmt(test_contamination_pct_value, precision=0, commas=False) + inflated_acc_str = fmt(inflated_acc_value, precision=0, commas=False) + true_acc_str = fmt(true_acc_value, precision=0, commas=False) + inflation_gap_str = fmt(inflation_gap_value, precision=0, commas=False) -# Pitfall 5 -coreset_size_pct_str = fmt(coreset_size_pct_value, precision=0, commas=False) -rare_class_acc_str = fmt(rare_class_acc_value, precision=0, commas=False) -majority_class_acc_str = fmt(majority_class_acc_value, precision=0, commas=False) + # Pitfall 4 + al_latency_days_str = fmt(al_latency_days_value, precision=0, commas=False) + model_drift_epochs_str = fmt(model_drift_epochs_value, precision=0, commas=False) + batch_size_small_str = fmt(batch_size_small_value, precision=0, commas=False) + batch_size_large_str = fmt(batch_size_large_value, precision=0, commas=True) + + # Fallacy 5 + cifar10_coreset_pct_str = 
fmt(cifar10_coreset_pct_value, precision=0, commas=False) + cifar10_acc_retained_str = fmt(cifar10_acc_retained_value, precision=0, commas=False) + imagenet_acc_retained_str = fmt(imagenet_acc_retained_value, precision=0, commas=False) + medical_acc_retained_str = fmt(medical_acc_retained_value, precision=0, commas=False) + + # Pitfall 5 + coreset_size_pct_str = fmt(coreset_size_pct_value, precision=0, commas=False) + rare_class_acc_str = fmt(rare_class_acc_value, precision=0, commas=False) + majority_class_acc_str = fmt(majority_class_acc_value, precision=0, commas=False) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +data_1m_str = FpScalingCalc.data_1m_str +data_10m_str = FpScalingCalc.data_10m_str +acc_gain_10x_str = FpScalingCalc.acc_gain_10x_str +curated_size_str = FpScalingCalc.curated_size_str +raw_size_str = FpScalingCalc.raw_size_str +curated_accuracy_str = FpScalingCalc.curated_accuracy_str +raw_accuracy_str = FpScalingCalc.raw_accuracy_str +curated_cost_ratio_str = FpScalingCalc.curated_cost_ratio_str +synthetic_gen1_acc_str = FpScalingCalc.synthetic_gen1_acc_str +synthetic_gen5_acc_str = FpScalingCalc.synthetic_gen5_acc_str +synthetic_acc_drop_str = FpScalingCalc.synthetic_acc_drop_str +optimal_synthetic_mix_str = FpScalingCalc.optimal_synthetic_mix_str +training_run_cost_str = FpScalingCalc.training_run_cost_str +efficiency_gain_pct_str = FpScalingCalc.efficiency_gain_pct_str +savings_str = FpScalingCalc.savings_str +selection_time_bad_str = FpScalingCalc.selection_time_bad_str +training_time_str = FpScalingCalc.training_time_str +full_training_time_str = FpScalingCalc.full_training_time_str +selection_overhead_ratio_str = FpScalingCalc.selection_overhead_ratio_str +selection_time_good_str = FpScalingCalc.selection_time_good_str +selection_overhead_good_pct_str = FpScalingCalc.selection_overhead_good_pct_str +total_samples_str = FpScalingCalc.total_samples_str +rare_class_pct_str = FpScalingCalc.rare_class_pct_str 
+rare_class_count_str = FpScalingCalc.rare_class_count_str +fp_coreset_pct_str = FpScalingCalc.fp_coreset_pct_str +expected_rare_in_coreset_str = FpScalingCalc.expected_rare_in_coreset_str +min_samples_threshold_str = FpScalingCalc.min_samples_threshold_str +test_contamination_pct_str = FpScalingCalc.test_contamination_pct_str +inflated_acc_str = FpScalingCalc.inflated_acc_str +true_acc_str = FpScalingCalc.true_acc_str +inflation_gap_str = FpScalingCalc.inflation_gap_str +al_latency_days_str = FpScalingCalc.al_latency_days_str +model_drift_epochs_str = FpScalingCalc.model_drift_epochs_str +batch_size_small_str = FpScalingCalc.batch_size_small_str +batch_size_large_str = FpScalingCalc.batch_size_large_str +cifar10_coreset_pct_str = FpScalingCalc.cifar10_coreset_pct_str +cifar10_acc_retained_str = FpScalingCalc.cifar10_acc_retained_str +imagenet_acc_retained_str = FpScalingCalc.imagenet_acc_retained_str +medical_acc_retained_str = FpScalingCalc.medical_acc_retained_str +coreset_size_pct_str = FpScalingCalc.coreset_size_pct_str +rare_class_acc_str = FpScalingCalc.rare_class_acc_str +majority_class_acc_str = FpScalingCalc.majority_class_acc_str ``` **Fallacy:** *Data is the new oil, so more is always better.* diff --git a/book/quarto/contents/vol1/ml_systems/ml_systems.qmd b/book/quarto/contents/vol1/ml_systems/ml_systems.qmd index 03ff3c5c3..72da1b43a 100644 --- a/book/quarto/contents/vol1/ml_systems/ml_systems.qmd +++ b/book/quarto/contents/vol1/ml_systems/ml_systems.qmd @@ -58,21 +58,32 @@ The defining insight of ML systems engineering is that constraints drive archite # ┌───────────────────────────────────────────────────────────────────────────── # │ CHAPTER-WIDE DEPLOYMENT SPECTRUM CONSTANTS # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Used across entire chapter — deployment tables, paradigm sections, -# │ physical constraints narrative, and Lighthouse Model summaries. 
+# │ Context: Used across entire chapter — @tbl-deployment-paradigms-overview, +# │ @sec-ml-systems-deployment-spectrum-71be (Power/Memory Wall prose), +# │ @sec-ml-systems-cloud-ml-maximizing-computational-power-a338, +# │ @sec-ml-systems-mobile-ml-personal-offline-intelligence-0983, +# │ and @sec-ml-systems-fallacies-pitfalls-3dfe. # │ -# │ Goal: Provide foundational parameters for the deployment spectrum. -# │ Show: Quantitative trade-offs across Cloud, Edge, Mobile, and TinyML. -# │ How: Centralize latency, power, and memory specs from mlsys.constants. +# │ Goal: Centralize foundational parameters for all four deployment paradigms. +# │ Show: Quantitative trade-offs (latency, power, memory) across Cloud, Edge, +# │ Mobile, and TinyML used in tables and narrative throughout the chapter. +# │ How: Read hardware specs from Tiers and Hardware twins; compute GPT-3 +# │ petaflop-days from training_ops; derive mobile tier specs from phone twin. # │ -# │ Imports: mlsys.constants, mlsys.formatting -# │ Exports: *_range_str (latency/RAM/storage), *_str (paradigm specs), gpt3_*_str +# │ Imports: mlsys.constants (MOBILE_TDP_W, PHONE_BATTERY_WH, TFLOPs, etc.) +# │ Exports: *_range_str (latency/RAM/storage), mobile_*_str, gpt3_*_str, +# │ phone_battery_str, compute_doubling_months_str, mem_bw_growth_pct_str +# │ +# │ Note: phone_battery_str, mobile_bw_range_str, mobile_tdp_range_str are also +# │ consumed at @sec-ml-systems-mobile-ml-personal-offline-intelligence-0983 +# │ (~2300 lines below) and @sec-ml-systems-fallacies-pitfalls-3dfe +# │ (~3200 lines below). These exports are intentionally chapter-scoped. 
# └───────────────────────────────────────────────────────────────────────────── from mlsys import Tiers, Hardware, Models from mlsys.constants import ( MOBILE_TDP_W, PHONE_BATTERY_WH, DLRM_MODEL_SIZE_FP32, TFLOPs, PFLOPs, Kparam, second, watt, hour, GB, SEC_PER_DAY, - BILLION, MILLION, TRILLION, THOUSAND + BILLION, MILLION, TRILLION, THOUSAND, ureg ) from mlsys.formatting import fmt, check @@ -100,7 +111,7 @@ class MLSystemsSetup: # Assumptions (ranges) mobile_ram_range = "8-16" mobile_storage_range = "128 GB-1 TB" - mobile_bw_range = f"{int(h_phone.memory_bw.to('GB/s').magnitude/2)}-{int(h_phone.memory_bw.to('GB/s').magnitude)}" + mobile_bw_range = f"{int(h_phone.memory_bw.m_as('GB/s')/2)}-{int(h_phone.memory_bw.m_as('GB/s'))}" # Latency ranges (ms) cloud_latency_range = "100-500" @@ -118,13 +129,13 @@ class MLSystemsSetup: gpt3_v100_count = 10000 # Mobile Specifics - mobile_tdp_w = h_phone.tdp.to(watt).magnitude if h_phone.tdp else 3 - mobile_npu_tops = h_phone.peak_flops.to(TFLOPs/second).magnitude - phone_battery_wh = h_phone.battery_capacity.to('Wh').magnitude if h_phone.battery_capacity else 15 + mobile_tdp_w = h_phone.tdp.m_as(watt) if h_phone.tdp else 3 + mobile_npu_tops = h_phone.peak_flops.m_as(TFLOPs/second) + phone_battery_wh = h_phone.battery_capacity.m_as('Wh') if h_phone.battery_capacity else 15 # ┌── 2. CALCULATION (The Physics) ───────────────────────────────────────── # GPT-3 Petaflop-days calculation using standardized units - gpt3_petaflop_days = (m_gpt3.training_ops / (PFLOPs * SEC_PER_DAY)).to_base_units().magnitude + gpt3_petaflop_days = (m_gpt3.training_ops / (PFLOPs * SEC_PER_DAY)).to_base_units().m_as(ureg.dimensionless) # ┌── 3. 
INVARIANTS (Guardrails) ─────────────────────────────────────────── check(gpt3_petaflop_days >= 3000, f"GPT-3 training should be >=3000 PF-days, got {gpt3_petaflop_days:.0f}") @@ -154,11 +165,11 @@ class MLSystemsSetup: mobile_npu_range_str = "1-10" phone_battery_str = fmt(phone_battery_wh, precision=0) - kws_params_str = fmt(m_kws.parameters.to(Kparam).magnitude, precision=0, commas=True) + kws_params_str = fmt(m_kws.parameters.m_as(Kparam), precision=0, commas=True) kws_size_kb_str = "100" # Approximate # DLRM Embedding (using Models Twin) - dlrm_embedding_str = fmt(Models.DLRM.model_size.to(GB).magnitude, precision=0) + dlrm_embedding_str = fmt(Models.DLRM.model_size.m_as(GB), precision=0) # ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── class ThrottlingScenario: @@ -445,7 +456,7 @@ class LightLatency: # ┌── 2. CALCULATION (The Physics) ───────────────────────────────────────── # Latency = (Distance * 2) / Speed of Light (Round-trip time) min_latency = (distance_km * 2) / SPEED_OF_LIGHT_FIBER_KM_S - min_latency_ms = min_latency.to(ureg.ms).magnitude + min_latency_ms = min_latency.m_as(ureg.ms) # ┌── 3. INVARIANTS (Guardrails) ─────────────────────────────────────────── check(min_latency_ms > safety_budget_ms, @@ -453,7 +464,7 @@ class LightLatency: # ┌── 4. 
OUTPUTS (Formatting) ────────────────────────────────────────────── min_latency_str = fmt(min_latency_ms, precision=0, commas=False) - distance_str = f"{distance_km.magnitude:,}" + distance_str = f"{distance_km.m_as('km'):,}" # ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── min_latency_str = LightLatency.min_latency_str @@ -694,14 +705,26 @@ These archetypes map naturally to deployment paradigms: **Compute Beasts** and * # ┌───────────────────────────────────────────────────────────────────────────── # │ LIGHTHOUSE MODEL SPECIFICATIONS # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Lighthouse Models callout (Five Reference Workloads) +# │ Context: @sec-ml-systems-workload-archetypes-fd10 — "Five Reference Workloads" +# │ callout; also @sec-ml-systems-system-balance-hardware-96ab +# │ (ResNet-50 bottleneck example, @tbl-representative-systems). # │ -# │ Goal: Provide specifications for the five Lighthouse Models. -# │ Show: Dimensionality and compute scale for ResNet, GPT, DLRM, and KWS. -# │ How: Retrieve parameters and FLOPs from mlsys.constants and Models. +# │ Goal: Provide specs for the five Lighthouse Models (ResNet-50, GPT-2/Llama, +# │ DLRM, MobileNetV2, KWS DS-CNN). +# │ Show: Parameter count, FLOP profile, and memory footprint that anchor +# │ each workload archetype to a concrete system. +# │ How: Retrieve parameters and FLOPs from Models twin; derive sizes using +# │ size_in_bytes() with 4-byte (FP32) precision. # │ -# │ Imports: mlsys.constants (RESNET50_*, GPT2_PARAMS), mlsys.formatting (fmt) -# │ Exports: resnet_*_str, gpt2_*_str, llama_range_str +# │ Imports: mlsys.constants (RESNET50_FLOPs, GFLOPs, Mparam, Bparam, etc.) 
+# │ Exports: resnet_gflops_str, resnet_params_m_str, resnet_fp32_mb_str, +# │ gpt2_params_b_str, llama_range_str, dlrm_embedding_str, +# │ mobilenet_flops_reduction_str, mobile_tdp_range_str, +# │ kws_params_str, kws_size_kb_str +# │ +# │ Note: mobile_tdp_range_str and mobilenet_flops_reduction_str are also +# │ consumed at @sec-ml-systems-mobile-ml-personal-offline-intelligence-0983 +# │ (~1400 lines below). These exports are intentionally chapter-scoped. # └───────────────────────────────────────────────────────────────────────────── from mlsys import Models from mlsys.constants import ( @@ -725,22 +748,22 @@ class LighthouseModels: m_kws = Models.Tiny.DS_CNN # ┌── 2. CALCULATION (The Physics) ───────────────────────────────────────── - resnet_flops_g = RESNET50_FLOPs.to(GFLOPs).magnitude - resnet_params_m = m_resnet.parameters.to(Mparam).magnitude - resnet_fp32_mb = m_resnet.size_in_bytes(4 * byte).to(MB).magnitude + resnet_flops_g = RESNET50_FLOPs.m_as(GFLOPs) + resnet_params_m = m_resnet.parameters.m_as(Mparam) + resnet_fp32_mb = m_resnet.size_in_bytes(4 * byte).m_as(MB) - gpt2_params_b = m_gpt2.parameters.to(Bparam).magnitude + gpt2_params_b = m_gpt2.parameters.m_as(Bparam) # DLRM Embedding Size - dlrm_embedding_gb = m_dlrm.model_size.to(GB).magnitude + dlrm_embedding_gb = m_dlrm.model_size.m_as(GB) # MobileNet # ResNet-50 ~4.1 GFLOPs, MobileNetV2 ~300 MFLOPs mobilenet_flops_reduction = 4100 / 300 # KWS - kws_params = m_kws.parameters.to(Kparam).magnitude - kws_size_kb = m_kws.size_in_bytes(4 * byte).to(KB).magnitude + kws_params = m_kws.parameters.m_as(Kparam) + kws_size_kb = m_kws.size_in_bytes(4 * byte).m_as(KB) # ┌── 3. 
INVARIANTS (Guardrails) ─────────────────────────────────────────── check(resnet_fp32_mb >= 90, f"ResNet50 size should be ~98MB, got {resnet_fp32_mb:.0f}MB") @@ -812,39 +835,64 @@ With the analytical tools (Iron Law, Bottleneck Principle, Workload Archetypes) ```{python} #| label: latency-constants #| echo: false + # ┌───────────────────────────────────────────────────────────────────────────── # │ LATENCY NUMBERS FOR ML SYSTEM DESIGN # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Table "Latency Numbers for ML System Design" +# │ Context: @tbl-latency-numbers in @sec-ml-systems-system-balance-hardware-96ab # │ -# │ Goal: Provide reference latencies for system design. -# │ Show: The 8-order-of-magnitude gap between register access and cloud RTT. -# │ How: List representative constants from NS to 100s of MS. +# │ Goal: Populate the 14-row latency reference table spanning compute, memory, +# │ network, and ML operation categories. +# │ Show: The 8-order-of-magnitude gap from nanosecond register access to +# │ hundreds-of-milliseconds cross-region network RTT. +# │ How: Assign representative string constants derived from hardware specs and +# │ published measurements; no arithmetic required. # │ -# │ Imports: (none — display constants only) -# │ Exports: lat_*_str variables for table +# │ Imports: (none — display string constants only) +# │ Exports: lat_compute_str, lat_npu_str, lat_llm_str, lat_l1_str, lat_hbm_str, +# │ lat_dram_str, lat_net_dc_str, lat_net_region_str, lat_net_cross_str, +# │ lat_kws_str, lat_face_str, lat_gpt4_str, lat_train_str # └───────────────────────────────────────────────────────────────────────────── -# --- Outputs: Compute latencies --- -lat_compute_str = "~1 ns" # GPU matrix multiply (per op) -lat_npu_str = "5–20 ms" # NPU inference (MobileNet) -lat_llm_str = "20–100 ms" # LLM token generation +# ┌── P.I.C.O. 
ISOLATED SCENARIO ─────────────────────────────────────────────── +class LatencyConstants: + """Namespace for Latency Constants.""" -# --- Outputs: Memory latencies --- -lat_l1_str = "~1 ns" # L1 cache hit -lat_hbm_str = "20–50 ns" # HBM read (GPU) -lat_dram_str = "50–100 ns" # DRAM read (mobile) + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + lat_compute_str = "~1 ns" # GPU matrix multiply (per op) + lat_npu_str = "5–20 ms" # NPU inference (MobileNet) + lat_llm_str = "20–100 ms" # LLM token generation -# --- Outputs: Network latencies --- -lat_net_dc_str = "0.5 ms" # same datacenter -lat_net_region_str = "1–5 ms" # same region -lat_net_cross_str = "50–150 ms" # cross-region + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + lat_l1_str = "~1 ns" # L1 cache hit + lat_hbm_str = "20–50 ns" # HBM read (GPU) + lat_dram_str = "50–100 ns" # DRAM read (mobile) -# --- Outputs: ML operation latencies --- -lat_kws_str = "100 μs" # wake-word detection (TinyML) -lat_face_str = "10–30 ms" # face detection (mobile) -lat_gpt4_str = "200–500 ms" # GPT-4 first token -lat_train_str = "200–400 ms" # ResNet-50 training step + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + lat_net_dc_str = "0.5 ms" # same datacenter + lat_net_region_str = "1–5 ms" # same region + lat_net_cross_str = "50–150 ms" # cross-region + + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + lat_kws_str = "100 μs" # wake-word detection (TinyML) + lat_face_str = "10–30 ms" # face detection (mobile) + lat_gpt4_str = "200–500 ms" # GPT-4 first token + lat_train_str = "200–400 ms" # ResNet-50 training step + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +lat_compute_str = LatencyConstants.lat_compute_str +lat_npu_str = LatencyConstants.lat_npu_str +lat_llm_str = LatencyConstants.lat_llm_str +lat_l1_str = LatencyConstants.lat_l1_str +lat_hbm_str = LatencyConstants.lat_hbm_str +lat_dram_str = LatencyConstants.lat_dram_str +lat_net_dc_str = LatencyConstants.lat_net_dc_str +lat_net_region_str = LatencyConstants.lat_net_region_str +lat_net_cross_str = LatencyConstants.lat_net_cross_str +lat_kws_str = LatencyConstants.lat_kws_str +lat_face_str = LatencyConstants.lat_face_str +lat_gpt4_str = LatencyConstants.lat_gpt4_str +lat_train_str = LatencyConstants.lat_train_str ``` These latencies, organized by category in @tbl-latency-numbers, span eight orders of magnitude: @@ -911,6 +959,7 @@ The following worked example demonstrates how to apply this analysis quantitativ ```{python} #| echo: false #| label: resnet-setup + # ┌───────────────────────────────────────────────────────────────────────────── # │ RESNET-50 MODEL SIZE SETUP # ├───────────────────────────────────────────────────────────────────────────── @@ -923,27 +972,43 @@ The following worked example demonstrates how to apply this analysis quantitativ # │ Imports: mlsys.constants (RESNET50_FLOPs, RESNET50_PARAMS), mlsys.formatting # │ Exports: resnet_*_str (GFLOPs, params, MB at each precision) # └───────────────────────────────────────────────────────────────────────────── + from mlsys.constants import RESNET50_FLOPs, RESNET50_PARAMS, GFLOPs, Mparam, byte, MB from mlsys.formatting import fmt, check -# --- Process (model sizes at different precisions) --- -resnet_fp32_bytes_value = 
RESNET50_PARAMS.magnitude * 4 * byte # 4 bytes per FP32 param -resnet_fp16_bytes_value = RESNET50_PARAMS.magnitude * 2 * byte # 2 bytes per FP16 param -resnet_int8_bytes_value = RESNET50_PARAMS.magnitude * 1 * byte # 1 byte per INT8 param -resnet_gflops_value = RESNET50_FLOPs.to(GFLOPs).magnitude -resnet_params_m_value = RESNET50_PARAMS.to(Mparam).magnitude +# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── +class ResnetSetup: + """Namespace for Resnet Setup.""" -# --- Outputs (formatted strings for prose) --- -resnet_gflops_str = fmt(resnet_gflops_value, precision=1, commas=False) # e.g. "4.1" GFLOPs -resnet_params_m_str = fmt(resnet_params_m_value, precision=1, commas=False) # e.g. "25.6" M -resnet_fp32_mb_str = fmt(resnet_fp32_bytes_value.to(MB).magnitude, precision=0, commas=False) # e.g. "102" MB -resnet_fp16_mb_str = fmt(resnet_fp16_bytes_value.to(MB).magnitude, precision=0, commas=False) # e.g. "51" MB -resnet_int8_mb_str = fmt(resnet_int8_bytes_value.to(MB).magnitude, precision=0, commas=False) # e.g. "26" MB + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + resnet_fp32_bytes_value = RESNET50_PARAMS.m_as('param') * 4 * byte # 4 bytes per FP32 param + resnet_fp16_bytes_value = RESNET50_PARAMS.m_as('param') * 2 * byte # 2 bytes per FP16 param + resnet_int8_bytes_value = RESNET50_PARAMS.m_as('param') * 1 * byte # 1 byte per INT8 param + resnet_gflops_value = RESNET50_FLOPs.m_as(GFLOPs) + resnet_params_m_value = RESNET50_PARAMS.m_as(Mparam) + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + resnet_gflops_str = fmt(resnet_gflops_value, precision=1, commas=False) # e.g. "4.1" GFLOPs + resnet_params_m_str = fmt(resnet_params_m_value, precision=1, commas=False) # e.g. "25.6" M + resnet_fp32_mb_str = fmt(resnet_fp32_bytes_value.m_as(MB), precision=0, commas=False) # e.g. "102" MB + resnet_fp16_mb_str = fmt(resnet_fp16_bytes_value.m_as(MB), precision=0, commas=False) # e.g. 
"51" MB + resnet_int8_mb_str = fmt(resnet_int8_bytes_value.m_as(MB), precision=0, commas=False) # e.g. "26" MB + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +resnet_fp16_mb_str = ResnetSetup.resnet_fp16_mb_str +resnet_fp32_mb_str = ResnetSetup.resnet_fp32_mb_str +resnet_gflops_str = ResnetSetup.resnet_gflops_str +resnet_int8_mb_str = ResnetSetup.resnet_int8_mb_str +resnet_params_m_str = ResnetSetup.resnet_params_m_str +# Quantity values needed by downstream cells (ResnetCloud, ResnetMobile class bodies) +resnet_fp16_bytes_value = ResnetSetup.resnet_fp16_bytes_value +resnet_int8_bytes_value = ResnetSetup.resnet_int8_bytes_value ``` ```{python} #| echo: false #| label: resnet-cloud + # ┌───────────────────────────────────────────────────────────────────────────── # │ RESNET-50 CLOUD (A100) BOTTLENECK ANALYSIS # ├───────────────────────────────────────────────────────────────────────────── @@ -956,6 +1021,7 @@ resnet_int8_mb_str = fmt(resnet_int8_bytes_value.to(MB).magnitude, precision=0, # │ Imports: mlsys.constants (A100_*, RESNET50_*), mlsys.formulas (calc_bottleneck) # │ Exports: a100_*_str, cloud_*_str, cloud_*_frac (Markdown fractions) # └───────────────────────────────────────────────────────────────────────────── + from mlsys import Hardware from mlsys.constants import ( RESNET50_FLOPs, A100_FLOPS_FP16_TENSOR, A100_MEM_BW, @@ -964,43 +1030,62 @@ from mlsys.constants import ( from mlsys.formulas import calc_bottleneck from mlsys.formatting import sci, fmt, sci_latex, md_frac -# --- Process (bottleneck analysis using Hardware Twin) --- -h_a100 = Hardware.A100 -cloud_stats = calc_bottleneck( - ops=RESNET50_FLOPs, - model_bytes=resnet_fp16_bytes_value, # from resnet-setup cell - device_flops=h_a100.peak_flops, - device_bw=h_a100.memory_bw, -) -a100_tflops_value = h_a100.peak_flops.to(TFLOPs / second).magnitude -a100_bw_tbs_value = h_a100.memory_bw.to(TB / second).magnitude -cloud_compute_ms_value = cloud_stats["compute_ms"] 
-cloud_memory_ms_value = cloud_stats["memory_ms"] -cloud_ratio_x_value = cloud_stats["ratio"] -cloud_ai_value = cloud_stats["intensity"] -cloud_bottleneck_value = cloud_stats["bottleneck"] +# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── +class ResnetCloud: + """Namespace for Resnet Cloud.""" -# --- LaTeX fraction components (for nice rendering) --- -resnet_flops_latex = sci_latex(RESNET50_FLOPs.to(flop)) -a100_flops_latex = sci_latex(h_a100.peak_flops.to(flop / second)) -resnet_fp16_bytes_latex = sci_latex(resnet_fp16_bytes_value.to(byte)) -a100_bw_latex = sci_latex(h_a100.memory_bw.to(byte / second)) -cloud_compute_frac = md_frac(resnet_flops_latex, a100_flops_latex, f"{cloud_compute_ms_value:.3f}", "ms") -cloud_memory_frac = md_frac(resnet_fp16_bytes_latex, a100_bw_latex, f"{cloud_memory_ms_value:.3f}", "ms") -cloud_ai_frac = md_frac(resnet_flops_latex, resnet_fp16_bytes_latex, f"{cloud_ai_value:.0f}", "FLOPs/byte") + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + h_a100 = Hardware.A100 + cloud_stats = calc_bottleneck( + ops=RESNET50_FLOPs, + model_bytes=resnet_fp16_bytes_value, # from resnet-setup cell + device_flops=h_a100.peak_flops, + device_bw=h_a100.memory_bw, + ) + a100_tflops_value = h_a100.peak_flops.m_as(TFLOPs / second) + a100_bw_tbs_value = h_a100.memory_bw.m_as(TB / second) + cloud_compute_ms_value = cloud_stats["compute_ms"] + cloud_memory_ms_value = cloud_stats["memory_ms"] + cloud_ratio_x_value = cloud_stats["ratio"] + cloud_ai_value = cloud_stats["intensity"] + cloud_bottleneck_value = cloud_stats["bottleneck"] -# --- Outputs (formatted strings for prose) --- -a100_tflops_str = fmt(a100_tflops_value, precision=0, commas=False) # e.g. "312" TFLOPS -a100_bw_tbs_str = fmt(a100_bw_tbs_value, precision=0, commas=False) # e.g. 
"2" TB/s -cloud_compute_ms_str = fmt(cloud_compute_ms_value, precision=3, commas=False) -cloud_memory_ms_str = fmt(cloud_memory_ms_value, precision=3, commas=False) -cloud_ratio_x_str = fmt(cloud_ratio_x_value, precision=0, commas=False) # memory/compute ratio -cloud_bottleneck_str = cloud_bottleneck_value # "Memory" or "Compute" + # --- LaTeX fraction components (for nice rendering) --- + resnet_flops_latex = sci_latex(RESNET50_FLOPs.to(flop)) + a100_flops_latex = sci_latex(h_a100.peak_flops.to(flop / second)) + resnet_fp16_bytes_latex = sci_latex(resnet_fp16_bytes_value.to(byte)) + a100_bw_latex = sci_latex(h_a100.memory_bw.to(byte / second)) + cloud_compute_frac = md_frac(resnet_flops_latex, a100_flops_latex, f"{cloud_compute_ms_value:.3f}", "ms") + cloud_memory_frac = md_frac(resnet_fp16_bytes_latex, a100_bw_latex, f"{cloud_memory_ms_value:.3f}", "ms") + cloud_ai_frac = md_frac(resnet_flops_latex, resnet_fp16_bytes_latex, f"{cloud_ai_value:.0f}", "FLOPs/byte") + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + a100_tflops_str = fmt(a100_tflops_value, precision=0, commas=False) # e.g. "312" TFLOPS + a100_bw_tbs_str = fmt(a100_bw_tbs_value, precision=0, commas=False) # e.g. 
"2" TB/s + cloud_compute_ms_str = fmt(cloud_compute_ms_value, precision=3, commas=False) + cloud_memory_ms_str = fmt(cloud_memory_ms_value, precision=3, commas=False) + cloud_ratio_x_str = fmt(cloud_ratio_x_value, precision=0, commas=False) # memory/compute ratio + cloud_bottleneck_str = cloud_bottleneck_value # "Memory" or "Compute" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +a100_bw_tbs_str = ResnetCloud.a100_bw_tbs_str +a100_tflops_str = ResnetCloud.a100_tflops_str +cloud_bottleneck_str = ResnetCloud.cloud_bottleneck_str +cloud_compute_ms_str = ResnetCloud.cloud_compute_ms_str +cloud_memory_ms_str = ResnetCloud.cloud_memory_ms_str +cloud_ratio_x_str = ResnetCloud.cloud_ratio_x_str +cloud_compute_frac = ResnetCloud.cloud_compute_frac +cloud_memory_frac = ResnetCloud.cloud_memory_frac +cloud_ai_frac = ResnetCloud.cloud_ai_frac +# Values needed by downstream ResnetMobile class body +cloud_stats = ResnetCloud.cloud_stats +resnet_flops_latex = ResnetCloud.resnet_flops_latex ``` ```{python} #| echo: false #| label: resnet-mobile + # ┌───────────────────────────────────────────────────────────────────────────── # │ RESNET-50 MOBILE (NPU) BOTTLENECK ANALYSIS # ├───────────────────────────────────────────────────────────────────────────── @@ -1013,6 +1098,7 @@ cloud_bottleneck_str = cloud_bottleneck_value # "Me # │ Imports: mlsys.constants (MOBILE_NPU_*, A100_MEM_BW), mlsys.formulas # │ Exports: mobile_*_str, bw_advantage_x_str, inference_speed_x_str # └───────────────────────────────────────────────────────────────────────────── + from mlsys import Hardware, Models from mlsys.constants import ( RESNET50_FLOPs, MOBILE_NPU_TOPS_INT8, MOBILE_NPU_MEM_BW, A100_MEM_BW, @@ -1021,42 +1107,56 @@ from mlsys.constants import ( from mlsys.formulas import calc_bottleneck from mlsys.formatting import sci_latex, md_frac, fmt -# --- Process (bottleneck analysis using Hardware/Models Twins) --- -h_phone = Hardware.Edge.Generic_Phone -m_resnet = 
Models.ResNet50 -h_a100 = Hardware.A100 +# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── +class ResnetMobile: + """Namespace for Resnet Mobile.""" -mobile_stats = calc_bottleneck( - ops=m_resnet.inference_flops, - model_bytes=resnet_int8_bytes_value, # from resnet-setup cell - device_flops=h_phone.peak_flops, - device_bw=h_phone.memory_bw, -) -mobile_tops_value = h_phone.peak_flops.to(TFLOPs / second).magnitude -mobile_bw_gbs_value = h_phone.memory_bw.to(GB / second).magnitude -mobile_compute_ms_value = mobile_stats["compute_ms"] -mobile_memory_ms_value = mobile_stats["memory_ms"] -mobile_ratio_x_value = mobile_stats["ratio"] -mobile_bottleneck_value = mobile_stats["bottleneck"] + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + h_phone = Hardware.Edge.Generic_Phone + m_resnet = Models.ResNet50 + h_a100 = Hardware.A100 -# --- Cross-platform comparison --- -bw_advantage_x_value = h_a100.memory_bw / h_phone.memory_bw -inference_speed_x_value = mobile_memory_ms_value / cloud_stats["memory_ms"] # uses cloud_stats + mobile_stats = calc_bottleneck( + ops=m_resnet.inference_flops, + model_bytes=resnet_int8_bytes_value, # from resnet-setup cell + device_flops=h_phone.peak_flops, + device_bw=h_phone.memory_bw, + ) + mobile_tops_value = h_phone.peak_flops.m_as(TFLOPs / second) + mobile_bw_gbs_value = h_phone.memory_bw.m_as(GB / second) + mobile_compute_ms_value = mobile_stats["compute_ms"] + mobile_memory_ms_value = mobile_stats["memory_ms"] + mobile_ratio_x_value = mobile_stats["ratio"] + mobile_bottleneck_value = mobile_stats["bottleneck"] -# --- LaTeX fraction components (for nice rendering) --- -mobile_npu_flops_latex = sci_latex(h_phone.peak_flops.to(flop / second)) -resnet_int8_bytes_latex = sci_latex(resnet_int8_bytes_value.to(byte)) -mobile_npu_bw_latex = sci_latex(h_phone.memory_bw.to(byte / second)) -mobile_compute_frac = md_frac(resnet_flops_latex, mobile_npu_flops_latex, 
f"{mobile_compute_ms_value:.2f}", "ms") -mobile_memory_frac = md_frac(resnet_int8_bytes_latex, mobile_npu_bw_latex, f"{mobile_memory_ms_value:.2f}", "ms") + # --- Cross-platform comparison --- + bw_advantage_x_value = h_a100.memory_bw / h_phone.memory_bw + inference_speed_x_value = mobile_memory_ms_value / cloud_stats["memory_ms"] # uses cloud_stats -# --- Outputs (formatted strings for prose) --- -mobile_tops_str = fmt(mobile_tops_value, precision=0, commas=False) # e.g. "10" TOPS -mobile_bw_gbs_str = fmt(mobile_bw_gbs_value, precision=0, commas=False) # e.g. "50" GB/s -mobile_ratio_x_str = fmt(mobile_ratio_x_value, precision=0, commas=False) # memory/compute ratio -mobile_bottleneck_str = mobile_bottleneck_value # "Memory" or "Compute" -bw_advantage_x_str = fmt(bw_advantage_x_value, precision=0, commas=False) # A100 vs NPU bandwidth -inference_speed_x_str = fmt(inference_speed_x_value, precision=0, commas=False) # latency ratio + # --- LaTeX fraction components (for nice rendering) --- + mobile_npu_flops_latex = sci_latex(h_phone.peak_flops.to(flop / second)) + resnet_int8_bytes_latex = sci_latex(resnet_int8_bytes_value.to(byte)) + mobile_npu_bw_latex = sci_latex(h_phone.memory_bw.to(byte / second)) + mobile_compute_frac = md_frac(resnet_flops_latex, mobile_npu_flops_latex, f"{mobile_compute_ms_value:.2f}", "ms") + mobile_memory_frac = md_frac(resnet_int8_bytes_latex, mobile_npu_bw_latex, f"{mobile_memory_ms_value:.2f}", "ms") + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + mobile_tops_str = fmt(mobile_tops_value, precision=0, commas=False) # e.g. "10" TOPS + mobile_bw_gbs_str = fmt(mobile_bw_gbs_value, precision=0, commas=False) # e.g. 
"50" GB/s + mobile_ratio_x_str = fmt(mobile_ratio_x_value, precision=0, commas=False) # memory/compute ratio + mobile_bottleneck_str = mobile_bottleneck_value # "Memory" or "Compute" + bw_advantage_x_str = fmt(bw_advantage_x_value, precision=0, commas=False) # A100 vs NPU bandwidth + inference_speed_x_str = fmt(inference_speed_x_value, precision=0, commas=False) # latency ratio + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +mobile_tops_str = ResnetMobile.mobile_tops_str +mobile_bw_gbs_str = ResnetMobile.mobile_bw_gbs_str +mobile_ratio_x_str = ResnetMobile.mobile_ratio_x_str +mobile_bottleneck_str = ResnetMobile.mobile_bottleneck_str +mobile_compute_frac = ResnetMobile.mobile_compute_frac +mobile_memory_frac = ResnetMobile.mobile_memory_frac +bw_advantage_x_str = ResnetMobile.bw_advantage_x_str +inference_speed_x_str = ResnetMobile.inference_speed_x_str ``` ::: {.callout-notebook title="ResNet-50 on Cloud vs Mobile"} @@ -1103,18 +1203,30 @@ As systems transition from Cloud to Edge to TinyML, available resources decrease ```{python} #| label: hardware-spectrum-setup #| echo: false + # ┌───────────────────────────────────────────────────────────────────────────── # │ HARDWARE SPECTRUM: REPRESENTATIVE SYSTEMS TABLE # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Tables "Hardware Spectrum" and "Deployment Decision Thresholds" +# │ Context: @tbl-representative-systems in @sec-ml-systems-system-balance-hardware-96ab +# │ and the deployment decision thresholds table that follows it. # │ -# │ Goal: Ground abstract deployment paradigms in concrete hardware. -# │ Show: The 9-order-of-magnitude power gap between TinyML and Cloud pods. -# │ How: List memory, power, and cost specs for TPU Pods, workstations, and MCUs. +# │ Goal: Ground abstract deployment paradigms in concrete hardware specs for +# │ TPU v4 Pod (Cloud), DGX Spark (Edge), and ESP32-CAM (TinyML). 
+# │ Show: The 9-order-of-magnitude power gap (4 MW cloud to 0.1 W TinyML) +# │ and 8-order-of-magnitude cost gap ($millions to $10) across tiers. +# │ How: Read memory, power, and cost from mlsys.constants for each platform; +# │ assign threshold strings for the decision boundary table. # │ -# │ Imports: mlsys.constants (TPU_POD_*, DGX_*, ESP32_*), mlsys.formatting (fmt) -# │ Exports: tpu_*_str, edge_*_str, tiny_*_str, *_thresh_str +# │ Imports: mlsys.constants (TPU_POD_MEM, TPU_POD_POWER, TPU_POD_CHIPS, +# │ DGX_RAM, DGX_STORAGE, DGX_POWER, DGX_PRICE_MIN, DGX_PRICE_MAX, +# │ ESP32_RAM, ESP32_FLASH, ESP32_POWER_MIN, ESP32_POWER_MAX, ESP32_PRICE) +# │ Exports: tpu_chips_str, cloud_mem_tb_str, cloud_pwr_mw_str, edge_mem_gb_str, +# │ edge_stor_tb_str, edge_pwr_w_str, edge_price_min_str, edge_price_max_str, +# │ tiny_ram_kb_str, tiny_flash_mb_str, tiny_pwr_min_str, tiny_pwr_max_str, +# │ tiny_price_str, cloud_thresh_tflops_str, edge_thresh_pflops_str, +# │ tiny_thresh_tops_str, tiny_thresh_mw_str # └───────────────────────────────────────────────────────────────────────────── + from mlsys.constants import ( TPU_POD_MEM, TPU_POD_POWER, TPU_POD_CHIPS, DGX_RAM, DGX_STORAGE, DGX_POWER, DGX_PRICE_MIN, DGX_PRICE_MAX, @@ -1123,32 +1235,57 @@ from mlsys.constants import ( ) from mlsys.formatting import fmt, check -# --- Outputs: Cloud (TPU v4 Pod) --- -tpu_chips_str = f"{TPU_POD_CHIPS:,}" # e.g. "4,096" chips -cloud_mem_tb_str = fmt(TPU_POD_MEM.to(TB).magnitude, precision=0, commas=False) # e.g. "131" TB -cloud_pwr_mw_str = fmt(TPU_POD_POWER.to("megawatt").magnitude, precision=0, commas=False) # e.g. "4" MW +# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── +class HardwareSpectrumSetup: + """Namespace for Hardware Spectrum Setup.""" -# --- Outputs: Edge (DGX Spark) --- -edge_mem_gb_str = fmt(DGX_RAM.to(GB).magnitude, precision=0, commas=False) # e.g. "128" GB -edge_stor_tb_str = fmt(DGX_STORAGE.to(TB).magnitude, precision=0, commas=False) # e.g. 
"4" TB -edge_pwr_w_str = fmt(DGX_POWER.to(watt).magnitude, precision=0, commas=False) # e.g. "500" W -edge_price_min_str = f"{DGX_PRICE_MIN.magnitude:,.0f}" # e.g. "3,000" -edge_price_max_str = f"{DGX_PRICE_MAX.magnitude:,.0f}" # e.g. "5,000" + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + tpu_chips_str = f"{TPU_POD_CHIPS:,}" # e.g. "4,096" chips + cloud_mem_tb_str = fmt(TPU_POD_MEM.m_as(TB), precision=0, commas=False) # e.g. "131" TB + cloud_pwr_mw_str = fmt(TPU_POD_POWER.m_as("megawatt"), precision=0, commas=False) # e.g. "4" MW -# --- Outputs: TinyML (ESP32-CAM) --- -tiny_ram_kb_str = fmt(ESP32_RAM.to(KiB).magnitude, precision=0, commas=False) # e.g. "520" KB -tiny_flash_mb_str = fmt(ESP32_FLASH.to(MB).magnitude, precision=0, commas=False) # e.g. "4" MB -tiny_pwr_min_str = f"{ESP32_POWER_MIN.magnitude}" # e.g. "0.1" W -tiny_pwr_max_str = f"{ESP32_POWER_MAX.magnitude}" # e.g. "0.5" W -tiny_price_str = f"{ESP32_PRICE.magnitude}" # e.g. "10" USD + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + edge_mem_gb_str = fmt(DGX_RAM.m_as(GB), precision=0, commas=False) # e.g. "128" GB + edge_stor_tb_str = fmt(DGX_STORAGE.m_as(TB), precision=0, commas=False) # e.g. "4" TB + edge_pwr_w_str = fmt(DGX_POWER.m_as(watt), precision=0, commas=False) # e.g. "500" W + edge_price_min_str = f"{DGX_PRICE_MIN.m_as(USD):,.0f}" # e.g. "3,000" + edge_price_max_str = f"{DGX_PRICE_MAX.m_as(USD):,.0f}" # e.g. "5,000" -# --- Outputs: Decision thresholds --- -cloud_thresh_tflops_str = "1000" # TFLOPS threshold for cloud -cloud_thresh_bw_str = "100" # GB/s memory bandwidth -edge_thresh_pflops_str = "1" # PFLOPS AI compute threshold -edge_thresh_bw_str = "270" # GB/s memory bandwidth -tiny_thresh_tops_str = "1" # TOPS compute threshold -tiny_thresh_mw_str = "1" # mW power threshold + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + tiny_ram_kb_str = fmt(ESP32_RAM.m_as(KiB), precision=0, commas=False) # e.g. 
"520" KB + tiny_flash_mb_str = fmt(ESP32_FLASH.m_as(MB), precision=0, commas=False) # e.g. "4" MB + tiny_pwr_min_str = f"{ESP32_POWER_MIN.m_as(watt)}" # e.g. "0.1" W + tiny_pwr_max_str = f"{ESP32_POWER_MAX.m_as(watt)}" # e.g. "0.5" W + tiny_price_str = f"{ESP32_PRICE.m_as(USD)}" # e.g. "10" USD + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + cloud_thresh_tflops_str = "1000" # TFLOPS threshold for cloud + cloud_thresh_bw_str = "100" # GB/s memory bandwidth + edge_thresh_pflops_str = "1" # PFLOPS AI compute threshold + edge_thresh_bw_str = "270" # GB/s memory bandwidth + tiny_thresh_tops_str = "1" # TOPS compute threshold + tiny_thresh_mw_str = "1" # mW power threshold + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +tpu_chips_str = HardwareSpectrumSetup.tpu_chips_str +cloud_mem_tb_str = HardwareSpectrumSetup.cloud_mem_tb_str +cloud_pwr_mw_str = HardwareSpectrumSetup.cloud_pwr_mw_str +edge_mem_gb_str = HardwareSpectrumSetup.edge_mem_gb_str +edge_stor_tb_str = HardwareSpectrumSetup.edge_stor_tb_str +edge_pwr_w_str = HardwareSpectrumSetup.edge_pwr_w_str +edge_price_min_str = HardwareSpectrumSetup.edge_price_min_str +edge_price_max_str = HardwareSpectrumSetup.edge_price_max_str +tiny_ram_kb_str = HardwareSpectrumSetup.tiny_ram_kb_str +tiny_flash_mb_str = HardwareSpectrumSetup.tiny_flash_mb_str +tiny_pwr_min_str = HardwareSpectrumSetup.tiny_pwr_min_str +tiny_pwr_max_str = HardwareSpectrumSetup.tiny_pwr_max_str +tiny_price_str = HardwareSpectrumSetup.tiny_price_str +cloud_thresh_tflops_str = HardwareSpectrumSetup.cloud_thresh_tflops_str +cloud_thresh_bw_str = HardwareSpectrumSetup.cloud_thresh_bw_str +edge_thresh_pflops_str = HardwareSpectrumSetup.edge_thresh_pflops_str +edge_thresh_bw_str = HardwareSpectrumSetup.edge_thresh_bw_str +tiny_thresh_tops_str = HardwareSpectrumSetup.tiny_thresh_tops_str +tiny_thresh_mw_str = HardwareSpectrumSetup.tiny_thresh_mw_str ``` \index{hardware spectrum!resource 
progression} @@ -1274,36 +1411,55 @@ above=1of $(B2.north east)!0.5!(B3.north west)$](B0){Cloud ML}; ```{python} #| echo: false #| label: distance-penalty + # ┌───────────────────────────────────────────────────────────────────────────── # │ DISTANCE PENALTY CALCULATION # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Callout "The Distance Penalty" (Cloud ML section) +# │ Context: "The Distance Penalty" callout in +# │ @sec-ml-systems-cloud-ml-tradeoffs-constraints-96ed # │ -# │ Goal: Demonstrate the physical impossibility of cloud for safety-critical real-time. -# │ Show: That round-trip latency alone can exceed the entire response budget. -# │ How: Calculate speed-of-light RTT for a 1,500km distance. +# │ Goal: Demonstrate why cloud inference is physically impossible for a +# │ 10 ms safety-critical response budget at 1,500 km distance. +# │ Show: That speed-of-light RTT alone (15 ms) already exceeds the 10 ms +# │ budget, leaving a −5 ms deficit before any computation begins. +# │ How: Apply calc_network_latency_ms() using SPEED_OF_LIGHT_FIBER_KM_S; +# │ subtract RTT from budget to get deficit. # │ -# │ Imports: mlsys.constants (SPEED_OF_LIGHT_FIBER_KM_S), mlsys.formulas -# │ Exports: sol_kms_str, rtt_formatted_str, deficit_str, distance_km_str +# │ Imports: mlsys.constants (SPEED_OF_LIGHT_FIBER_KM_S), +# │ mlsys.formulas (calc_network_latency_ms), mlsys.formatting (fmt, check) +# │ Exports: sol_kms_str, rtt_formatted_str, deficit_str, distance_km_str, +# │ safety_budget_str # └───────────────────────────────────────────────────────────────────────────── + from mlsys.constants import SPEED_OF_LIGHT_FIBER_KM_S from mlsys.formulas import calc_network_latency_ms from mlsys.formatting import fmt, check -# --- Inputs (safety-critical scenario) --- -distance_km_value = 1500 # km to cloud datacenter -safety_budget_ms_value = 10 # ms safety requirement +# ┌── P.I.C.O. 
ISOLATED SCENARIO ─────────────────────────────────────────────── +class DistancePenalty: + """Namespace for Distance Penalty.""" -# --- Process (light-speed round-trip) --- -round_trip_ms_value = calc_network_latency_ms(distance_km_value) -deficit_ms_value = safety_budget_ms_value - round_trip_ms_value + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + distance_km_value = 1500 # km to cloud datacenter + safety_budget_ms_value = 10 # ms safety requirement -# --- Outputs (formatted strings for prose) --- -sol_kms_str = f"{SPEED_OF_LIGHT_FIBER_KM_S.magnitude:,.0f}" # e.g. "200,000" km/s -rtt_formatted_str = fmt(round_trip_ms_value, precision=0, commas=False) # e.g. "15" ms -deficit_str = fmt(deficit_ms_value, precision=0, commas=False) # e.g. "-5" ms -distance_km_str = f"{distance_km_value:,}" # e.g. "1,500" km -safety_budget_str = f"{safety_budget_ms_value}" # "10" ms + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + round_trip_ms_value = calc_network_latency_ms(distance_km_value) + deficit_ms_value = safety_budget_ms_value - round_trip_ms_value + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + sol_kms_str = f"{SPEED_OF_LIGHT_FIBER_KM_S.m_as('km/s'):,.0f}" # e.g. "200,000" km/s + rtt_formatted_str = fmt(round_trip_ms_value, precision=0, commas=False) # e.g. "15" ms + deficit_str = fmt(deficit_ms_value, precision=0, commas=False) # e.g. "-5" ms + distance_km_str = f"{distance_km_value:,}" # e.g. 
"1,500" km + safety_budget_str = f"{safety_budget_ms_value}" # "10" ms + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +sol_kms_str = DistancePenalty.sol_kms_str +rtt_formatted_str = DistancePenalty.rtt_formatted_str +deficit_str = DistancePenalty.deficit_str +distance_km_str = DistancePenalty.distance_km_str +safety_budget_str = DistancePenalty.safety_budget_str ``` ::: {.callout-notebook title="The Distance Penalty"} @@ -1358,15 +1514,22 @@ Cost management introduces operational complexity requiring total cost of owners # ┌───────────────────────────────────────────────────────────────────────────── # │ CLOUD VS. EDGE TOTAL COST OF OWNERSHIP (TCO) # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Callout "Cloud vs. Edge TCO" (Cloud ML Trade-offs section) +# │ Context: "Cloud vs. Edge TCO" worked example callout in +# │ @sec-ml-systems-cloud-ml-tradeoffs-constraints-96ed # │ -# │ Goal: Compare Total Cost of Ownership between Cloud and Edge. -# │ Show: The 45% savings of Edge at high volume and its 60% labor dominance. -# │ How: Model CapEx, OpEx, and egress costs over a 3-year lifespan. +# │ Goal: Compute and compare 3-year annualized TCO for cloud (AWS A10G) vs. +# │ on-premises edge (GenericServer) serving 1 million requests/day. +# │ Show: That edge saves ~45% at this volume, but labor (~60% of edge cost) +# │ dominates — making "minimize compute" a misleading optimization target. +# │ How: Model cloud CapEx (GPU hours + egress + load balancer + logs) and +# │ edge CapEx/OpEx (amortized hardware + power + cooling + fiber + labor) +# │ using HOURS_PER_YEAR, CLOUD_EGRESS_PER_GB, and CLOUD_ELECTRICITY_PER_KWH. 
# │ # │ Imports: mlsys.constants (DAYS_PER_YEAR, HOURS_PER_YEAR, CLOUD_EGRESS_PER_GB, -# │ SERVER_POWER_W, CLOUD_ELECTRICITY_PER_KWH), mlsys.formatting (fmt) -# │ Exports: c_*_str (cloud costs), e_*_str (edge costs), edge_savings_str +# │ CLOUD_ELECTRICITY_PER_KWH, USD, GB, watt, ureg, MILLION, MIB_TO_BYTES) +# │ Exports: c_gpu_str, c_egress_str, c_lb_str, c_logs_str, c_total_str, +# │ e_capex_str, e_power_str, e_cool_str, e_net_str, e_labor_str, +# │ e_total_str, edge_savings_str, labor_pct_str # └───────────────────────────────────────────────────────────────────────────── from mlsys import Hardware from mlsys.constants import ( @@ -1392,7 +1555,7 @@ class CloudEdgeTCO: # Cloud (AWS 2024) gpu_price_per_hr = 0.75 # A10G gpu_instances = 4 - egress_per_gb = CLOUD_EGRESS_PER_GB.to(USD / GB).magnitude + egress_per_gb = CLOUD_EGRESS_PER_GB.m_as(USD / GB) lb_base_per_hr = 0.025 lb_lcu_per_hr = 0.008 avg_lcu = 50 @@ -1401,8 +1564,8 @@ class CloudEdgeTCO: server = Hardware.Edge.GenericServer server_cost = 15000 server_life_years = 3 - power_watts = server.tdp.to(watt).magnitude - electricity_per_kwh = CLOUD_ELECTRICITY_PER_KWH.to(USD / ureg.kilowatt_hour).magnitude + power_watts = server.tdp.m_as(watt) + electricity_per_kwh = CLOUD_ELECTRICITY_PER_KWH.m_as(USD / ureg.kilowatt_hour) cooling_overhead = 0.30 fiber_annual = 1200 devops_fte = 0.1 @@ -1547,15 +1710,23 @@ Economics drive this architecture as much as latency. Attempting to process voic # ┌───────────────────────────────────────────────────────────────────────────── # │ VOICE ASSISTANT WALL: ECONOMICS + INFRASTRUCTURE # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Callout "The Voice Assistant Wall" (Cloud ML large-scale section) +# │ Context: "The Voice Assistant Wall" callout in +# │ @sec-ml-systems-largescale-training-inference-e16d # │ -# │ Goal: Demonstrate why cloud-only voice processing fails at global scale. 
-# │ Show: The economic ($500M/year) and bandwidth (32 TB/s) walls. -# │ How: Model global cost and network traffic for 1 billion voice devices. -# │ backbone; even query-only needs 20+ data centers at peak (infra wall). +# │ Goal: Demonstrate why cloud-only voice processing fails at 1-billion-device +# │ scale on both economics and infrastructure grounds simultaneously. +# │ Show: The $500M/year economic wall and the 20+ datacenter infrastructure +# │ wall that emerge from 1B devices × 20 queries/day at 200 ms/query. +# │ How: Calculate total annual cloud cost, GPU-days required, peak datacenter +# │ count (with 3× peak multiplier), and raw audio bandwidth (TB/s). # │ -# │ Imports: mlsys.formatting (fmt), mlsys.constants (BILLION, MILLION) -# │ Exports: ww_*_str (economics), vi_*_str (infrastructure) +# │ Imports: mlsys.constants (BILLION, TRILLION, SEC_PER_HOUR, HOURS_PER_DAY, +# │ BITS_PER_BYTE, KIB_TO_BYTES, MIB_TO_BYTES, MS_PER_SEC) +# │ Exports: ww_devices_b_str, ww_cloud_cost_str, ww_total_cost_str, +# │ ww_edge_power_range_str, ww_edge_cost_str, +# │ vi_devices_str, vi_queries_str, vi_total_queries_str, +# │ vi_gpu_hours_str, vi_datacenters_avg_str, vi_datacenters_peak_str, +# │ vi_audio_kb_str, vi_total_audio_tb_str # └───────────────────────────────────────────────────────────────────────────── from mlsys.constants import ( BILLION, TRILLION, SEC_PER_HOUR, HOURS_PER_DAY, @@ -1773,15 +1944,22 @@ The benefits of lower bandwidth usage and reduced latency become stark when we e # ┌───────────────────────────────────────────────────────────────────────────── # │ BANDWIDTH BOTTLENECK CALCULATION # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Callout "The Bandwidth Bottleneck" (Edge ML section) +# │ Context: "The Bandwidth Bottleneck" worked example callout in +# │ @sec-ml-systems-edge-ml-benefits-deployment-challenges-b2d0 # │ -# │ Goal: Demonstrate the physical bandwidth wall for raw video streaming. 
-# │ Show: That 100 HD cameras exceed a 10 Gbps backbone by 5×. -# │ How: Calculate aggregate data rates for 1080p video streams. +# │ Goal: Prove that streaming 100 × 1080p cameras to the cloud is physically +# │ impossible over a 10 Gbps link and economically prohibitive via egress. +# │ Show: That aggregate data rate (≈5 GB/s) exceeds the 10 Gbps line by 5×, +# │ and 24/7 egress costs $M/month — making local edge processing mandatory. +# │ How: Calculate bytes/frame × fps × cameras; compare to Ethernet_10G cap; +# │ use calc_monthly_egress_cost() for the economic wall. # │ -# │ Imports: mlsys.constants (VIDEO_*, CLOUD_EGRESS_PER_GB, NETWORK_10G_BW), -# │ mlsys.formulas (calc_monthly_egress_cost), mlsys.formatting (fmt) -# │ Exports: cam_rate_mbs_str, total_rate_gbs_str, monthly_cost_m_str, etc. +# │ Imports: mlsys.constants (VIDEO_1080P_WIDTH, VIDEO_1080P_HEIGHT, +# │ VIDEO_BYTES_PER_PIXEL_RGB, VIDEO_FPS_STANDARD, CLOUD_EGRESS_PER_GB, +# │ MB, GB, second, MILLION), mlsys.formulas (calc_monthly_egress_cost) +# │ Exports: cam_rate_mbs_str, total_rate_gbs_str, monthly_cost_m_str, +# │ net_cap_gbs_str, bw_short_x_str, num_cameras_str, bb_fps_str, +# │ egress_cost_str, video_width_str, video_height_str, bytes_per_pixel_str # └───────────────────────────────────────────────────────────────────────────── from mlsys import Hardware from mlsys.formulas import calc_monthly_egress_cost @@ -1813,7 +1991,7 @@ class BandwidthBottleneck: total_bytes_per_sec = (num_cameras * bytes_per_sec_single).to("byte/second") network_cap_bytes = network.bandwidth.to("byte/second") - shortfall_ratio = (total_bytes_per_sec / network_cap_bytes).magnitude + shortfall_ratio = (total_bytes_per_sec / network_cap_bytes).m_as('') # Cost (using helper formula) monthly_cost = calc_monthly_egress_cost(total_bytes_per_sec, CLOUD_EGRESS_PER_GB) @@ -1823,15 +2001,15 @@ class BandwidthBottleneck: check(shortfall_ratio >= 2, f"Shortfall ({shortfall_ratio:.1f}x) is too small to be a 'crisis'.") # ┌── 4. 
OUTPUTS (Formatting) ────────────────────────────────────────────── - cam_rate_mbs_str = fmt(bytes_per_sec_single.to(MB/second).magnitude, precision=0, commas=False) - total_rate_gbs_str = fmt(total_bytes_per_sec.to(GB/second).magnitude, precision=1, commas=False) + cam_rate_mbs_str = fmt(bytes_per_sec_single.m_as(MB/second), precision=0, commas=False) + total_rate_gbs_str = fmt(total_bytes_per_sec.m_as(GB/second), precision=1, commas=False) monthly_cost_m_str = fmt(monthly_cost / MILLION, precision=1, commas=False) - net_cap_gbs_str = fmt(network.bandwidth.to(GB/second).magnitude, precision=2, commas=False) + net_cap_gbs_str = fmt(network.bandwidth.m_as(GB/second), precision=2, commas=False) bw_short_x_str = fmt(shortfall_ratio, precision=0, commas=False) num_cameras_str = f"{num_cameras}" - bb_fps_str = f"{int(fps.magnitude)}" - egress_cost_str = f"{CLOUD_EGRESS_PER_GB.magnitude}" + bb_fps_str = f"{int(fps.m_as('Hz'))}" + egress_cost_str = f"{CLOUD_EGRESS_PER_GB.m_as(USD / GB)}" video_width_str = fmt(width, precision=0, commas=False) video_height_str = fmt(height, precision=0, commas=False) bytes_per_pixel_str = fmt(bpp, precision=0, commas=False) @@ -1973,20 +2151,26 @@ To make these trade-offs concrete, the following worked example applies *edge in # ┌───────────────────────────────────────────────────────────────────────────── # │ EDGE INFERENCE SIZING: RETAIL DEPLOYMENT # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Callout "Edge Inference Sizing" — hardware selection for retail chain +# │ Context: "Edge Inference Sizing" worked example callout in +# │ @sec-ml-systems-realtime-industrial-iot-systems-373a # │ -# │ Goal: Select cost-optimal hardware for a large-scale edge deployment. -# │ Show: That right-sized edge devices (Coral) outperform workstation-class hardware in TCO. -# │ How: Calculate aggregate TFLOPS requirements and model 3-year fleet costs. 
+# │ Goal: Select the cost-optimal edge accelerator for a 500-store YOLOv8 Nano +# │ deployment running 20 cameras at 15 FPS per store. +# │ Show: That right-sized purpose-built accelerators (Coral at 4 TOPS/W) yield +# │ lower 3-year fleet TCO than over-provisioned workstation-class hardware. +# │ How: Compute sustained GFLOPS requirement from inference rate × model FLOPs; +# │ apply calc_fleet_tco() with hardware TDP and CLOUD_ELECTRICITY_PER_KWH. # │ -# │ Imports: mlsys.constants (YOLOV8_NANO_FLOPs, GFLOPs, CLOUD_ELECTRICITY_PER_KWH, -# │ HOURS_PER_YEAR), mlsys.formulas (calc_fleet_tco) +# │ Imports: mlsys.constants (GFLOPs, TFLOPs, CLOUD_ELECTRICITY_PER_KWH, +# │ HOURS_PER_YEAR, USD, watt, ureg), mlsys.formulas (calc_fleet_tco) # │ Exports: stores_str, cameras_per_store_str, fps_str, inf_per_sec_str, -# │ yolo_gflops_str, sustained_gf_str, req_tflops_str, coral_*_str, -# │ jetson_*_str, nuc_*_str, coral_tco_k_str, years_str, etc. +# │ yolo_gflops_str, sustained_gf_str, req_tflops_str, coral_tops_str, +# │ coral_power_w_str, coral_tco_k_str, jetson_tops_str, jetson_power_w_str, +# │ jetson_tco_k_str, nuc_tops_str, nuc_power_w_str, nuc_tco_k_str, +# │ power_ratio_str, elec_cost_str, years_str # └───────────────────────────────────────────────────────────────────────────── from mlsys import Hardware, Models -from mlsys.constants import GFLOPs, CLOUD_ELECTRICITY_PER_KWH, HOURS_PER_YEAR, TFLOPs +from mlsys.constants import GFLOPs, CLOUD_ELECTRICITY_PER_KWH, HOURS_PER_YEAR, TFLOPs, USD, watt, ureg from mlsys.formulas import calc_fleet_tco from mlsys.formatting import fmt, check @@ -2024,8 +2208,8 @@ class EdgeSizing: # YOLOv8 Nano Inference FLOPs from Models Twin yolo_flops = model.inference_flops if model.inference_flops else model.training_ops - sustained_gflops = (inf_per_sec * yolo_flops).to(GFLOPs).magnitude - required_tflops = (sustained_gflops * headroom * GFLOPs).to(TFLOPs).magnitude + sustained_gflops = (inf_per_sec * yolo_flops).m_as(GFLOPs) + 
required_tflops = (sustained_gflops * headroom * GFLOPs).m_as(TFLOPs) # TCO coral_tco = calc_fleet_tco(coral_cost, coral.tdp, stores, years, CLOUD_ELECTRICITY_PER_KWH) @@ -2036,7 +2220,7 @@ class EdgeSizing: coral_power_opex = coral_tco - coral_fleet_capex # ┌── 3. INVARIANTS (Guardrails) ─────────────────────────────────────────── - if required_tflops > coral.peak_flops.to(TFLOPs/second).magnitude: + if required_tflops > coral.peak_flops.m_as(TFLOPs/second): # Note: Coral is 4 TOPS (INT8). YOLO is FP32/INT8? # The original code used 4 TOPS vs 2 TFLOPS required. pass @@ -2048,21 +2232,21 @@ class EdgeSizing: headroom_str = f"{headroom:.0f}" inf_per_sec_str = f"{inf_per_sec}" - yolo_gflops_str = fmt(yolo_flops.to(GFLOPs).magnitude, precision=1) + yolo_gflops_str = fmt(yolo_flops.m_as(GFLOPs), precision=1) sustained_gf_str = fmt(sustained_gflops, precision=0) req_tflops_str = fmt(required_tflops, precision=0) coral_cost_str = f"{coral_cost}" - coral_power_w_str = f"{coral.tdp.magnitude:.0f}" - coral_tops_str = f"{coral.peak_flops.to(TFLOPs/second).magnitude:.0f}" + coral_power_w_str = f"{coral.tdp.m_as(watt):.0f}" + coral_tops_str = f"{coral.peak_flops.m_as(TFLOPs/second):.0f}" jetson_cost_str = f"{jetson_cost}" jetson_power_range_str = "10-40" - jetson_tops_str = f"{jetson.peak_flops.to(TFLOPs/second).magnitude:.0f}" + jetson_tops_str = f"{jetson.peak_flops.m_as(TFLOPs/second):.0f}" nuc_cost_str = f"{nuc_cost}" - nuc_power_w_str = f"{nuc.tdp.magnitude:.0f}" - nuc_tops_str = f"{nuc.peak_flops.to(TFLOPs/second).magnitude:.0f}" + nuc_power_w_str = f"{nuc.tdp.m_as(watt):.0f}" + nuc_tops_str = f"{nuc.peak_flops.m_as(TFLOPs/second):.0f}" coral_fleet_k_str = fmt(coral_fleet_capex / 1000, precision=0) coral_tco_k_str = fmt(coral_tco / 1000, precision=0) @@ -2078,8 +2262,8 @@ class EdgeSizing: hours_per_year_str = f"{HOURS_PER_YEAR}" coral_power_cost_k_str = fmt(coral_power_opex / 1000, precision=1) - power_ratio_str = fmt(jetson.tdp.magnitude / coral.tdp.magnitude, 
precision=0, commas=False) - elec_cost_str = f"{CLOUD_ELECTRICITY_PER_KWH.magnitude}" + power_ratio_str = fmt(jetson.tdp.m_as(watt) / coral.tdp.m_as(watt), precision=0, commas=False) + elec_cost_str = f"{CLOUD_ELECTRICITY_PER_KWH.m_as(USD / ureg.kilowatt_hour)}" # Cloud alternative: 500 stores each need ~1 GPU instance at $0.75/hr (A10G on-demand) cloud_gpu_price_per_hr = 0.75 @@ -2306,13 +2490,13 @@ class BatteryTax: daily_budget_pct = (power_draw * runtime_hours) / battery_wh * 100 # ┌── 3. INVARIANTS (Guardrails) ─────────────────────────────────────────── - check(runtime_hours.magnitude <= 24, f"Always-on ML should drain battery fast, but got {runtime_hours:.1f} hours.") + check(runtime_hours.m_as(ureg.hour) <= 24, f"Always-on ML should drain battery fast, but got {runtime_hours:.1f} hours.") # ┌── 4. OUTPUTS (Formatting) ────────────────────────────────────────────── - runtime_str = fmt(runtime_hours.magnitude, precision=1, commas=False) - pwr_w_str = fmt(power_draw.to(ureg.watt).magnitude, precision=0, commas=False) - batt_wh_str = fmt(battery_wh.magnitude, precision=0, commas=False) - budget_pct_str = fmt(daily_budget_pct.magnitude, precision=0, commas=False) + runtime_str = fmt(runtime_hours.m_as(ureg.hour), precision=1, commas=False) + pwr_w_str = fmt(power_draw.m_as(ureg.watt), precision=0, commas=False) + batt_wh_str = fmt(battery_wh.m_as(ureg.Wh), precision=0, commas=False) + budget_pct_str = fmt(daily_budget_pct.m_as(''), precision=0, commas=False) runtime_frac = md_frac(f"{batt_wh_str} Wh", f"{pwr_w_str} W", f"**{runtime_str} hours**") @@ -2341,6 +2525,7 @@ The battery constraint limits total energy consumption over time. 
However, even ```{python} #| label: thermal-quant-calc #| echo: false + # ┌───────────────────────────────────────────────────────────────────────────── # │ THERMAL WALL: QUANTIZATION POWER REDUCTION # ├───────────────────────────────────────────────────────────────────────────── @@ -2353,16 +2538,26 @@ The battery constraint limits total energy consumption over time. However, even # │ Imports: mlsys.formatting (fmt) # │ Exports: baseline_str, quant_power_str, quant_red_str # └───────────────────────────────────────────────────────────────────────────── + from mlsys.formatting import fmt, check -baseline_power_w_value = 12 # W, unoptimized LLM power -quant_reduction_value = 4 # FP32→INT8 power reduction +# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── +class ThermalQuantCalc: + """Namespace for Thermal Quant Calc.""" -quant_power_w_value = baseline_power_w_value / quant_reduction_value # 12W / 4 = 3W + baseline_power_w_value = 12 # W, unoptimized LLM power + quant_reduction_value = 4 # FP32→INT8 power reduction -baseline_str = fmt(baseline_power_w_value, precision=0, commas=False) # e.g. "12" W -quant_power_str = fmt(quant_power_w_value, precision=0, commas=False) # e.g. "3" W -quant_red_str = fmt(quant_reduction_value, precision=0, commas=False) # e.g. "4" × + quant_power_w_value = baseline_power_w_value / quant_reduction_value # 12W / 4 = 3W + + baseline_str = fmt(baseline_power_w_value, precision=0, commas=False) # e.g. "12" W + quant_power_str = fmt(quant_power_w_value, precision=0, commas=False) # e.g. "3" W + quant_red_str = fmt(quant_reduction_value, precision=0, commas=False) # e.g. 
"4" × + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +baseline_str = ThermalQuantCalc.baseline_str +quant_power_str = ThermalQuantCalc.quant_power_str +quant_red_str = ThermalQuantCalc.quant_red_str ``` ::: {.callout-notebook title="The Thermal Wall"} @@ -2503,15 +2698,15 @@ class EnergyInference: e_mobilenet_str = "~50 mJ" e_kws_str = "~10 µJ" - q_gpt4_str = fmt(q_gpt4.magnitude, precision=0, commas=True) - q_resnet_cloud_str = fmt(q_resnet_cloud.magnitude, precision=0, commas=True) - q_resnet_edge_str = fmt(q_resnet_edge.magnitude, precision=0, commas=True) - q_mobilenet_str = fmt(q_mobilenet.magnitude, precision=0, commas=True) + q_gpt4_str = fmt(q_gpt4.m_as(''), precision=0, commas=True) + q_resnet_cloud_str = fmt(q_resnet_cloud.m_as(''), precision=0, commas=True) + q_resnet_edge_str = fmt(q_resnet_edge.m_as(''), precision=0, commas=True) + q_mobilenet_str = fmt(q_mobilenet.m_as(''), precision=0, commas=True) # Use BILLION constant - q_kws_str = fmt(q_kws.magnitude / BILLION, precision=0, commas=False) + " billion" + q_kws_str = fmt(q_kws.m_as('') / BILLION, precision=0, commas=False) + " billion" - batt_cap_mah_str = f"{BATTERY_CAPACITY_MAH.magnitude:.0f}" - batt_volt_str = f"{BATTERY_VOLTAGE_V.magnitude}" + batt_cap_mah_str = f"{BATTERY_CAPACITY_MAH.m_as('mAh'):.0f}" + batt_volt_str = f"{BATTERY_VOLTAGE_V.m_as('V')}" # ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── e_gpt4_str = EnergyInference.e_gpt4_str @@ -2658,6 +2853,7 @@ Each paradigm emerged as a response to specific physical constraints: Cloud ML a ```{python} #| label: paradigms-table #| echo: false + # ┌───────────────────────────────────────────────────────────────────────────── # │ PARADIGMS TABLE: CLOUD VS EDGE VS MOBILE VS TINYML # ├───────────────────────────────────────────────────────────────────────────── @@ -2672,35 +2868,61 @@ Each paradigm emerged as a response to specific physical constraints: Cloud ML a # │ Exports: 
cloud_*_str, edge_*_str, mobile_*_str, tiny_*_str # └───────────────────────────────────────────────────────────────────────────── -# --- Latency (network + inference time) --- -cloud_lat_str = "100 ms-1000 ms+" # cloud round-trip -edge_lat_str = "10-100 ms" # local network + inference -mobile_lat_str = "5-50 ms" # on-device inference -tiny_lat_str = "1-10 ms" # MCU response time +# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── +class ParadigmsTable: + """Namespace for Paradigms Table.""" -# --- Compute capability --- -cloud_comp_str = "Very High (Multiple GPUs/TPUs)" # kW-class accelerators -edge_comp_str = "High (Edge GPUs)" # 10s-100s W accelerators -mobile_comp_str = "Moderate (Mobile NPUs/GPUs)" # 1-10 W NPUs -tiny_comp_str = "Very Low (MCU/tiny processors)" # mW-class MCUs + # --- Latency (network + inference time) --- + cloud_lat_str = "100 ms-1000 ms+" # cloud round-trip + edge_lat_str = "10-100 ms" # local network + inference + mobile_lat_str = "5-50 ms" # on-device inference + tiny_lat_str = "1-10 ms" # MCU response time -# --- Storage capacity --- -cloud_stor_str = "Unlimited (petabytes+)" # elastic cloud storage -edge_stor_str = "Large (terabytes)" # local SSDs -mobile_stor_str = "Moderate (gigabytes)" # phone flash -tiny_stor_str = "Very Limited (kilobytes-megabytes)" # SRAM/flash + # --- Compute capability --- + cloud_comp_str = "Very High (Multiple GPUs/TPUs)" # kW-class accelerators + edge_comp_str = "High (Edge GPUs)" # 10s-100s W accelerators + mobile_comp_str = "Moderate (Mobile NPUs/GPUs)" # 1-10 W NPUs + tiny_comp_str = "Very Low (MCU/tiny processors)" # mW-class MCUs -# --- Energy consumption --- -cloud_pwr_str = "Very High (kW-MW range)" # data center scale -edge_pwr_str = "High (100 s W)" # edge server scale -mobile_pwr_str = "Moderate (1-10 W)" # phone TDP -tiny_pwr_str = "Very Low (mW range)" # energy harvesting + # --- Storage capacity --- + cloud_stor_str = "Unlimited (petabytes+)" # elastic cloud storage 
+ edge_stor_str = "Large (terabytes)" # local SSDs + mobile_stor_str = "Moderate (gigabytes)" # phone flash + tiny_stor_str = "Very Limited (kilobytes-megabytes)" # SRAM/flash -# --- Cost structure --- -cloud_cost_str = "High ($1000s+/month)" # usage-based cloud -edge_cost_str = "Moderate ($100s-1000s)" # hardware capex -mobile_cost_str = "Low ($0-10s)" # app distribution -tiny_cost_str = "Very Low ($1-10s)" # MCU unit cost + # --- Energy consumption --- + cloud_pwr_str = "Very High (kW-MW range)" # data center scale + edge_pwr_str = "High (100 s W)" # edge server scale + mobile_pwr_str = "Moderate (1-10 W)" # phone TDP + tiny_pwr_str = "Very Low (mW range)" # energy harvesting + + # --- Cost structure --- + cloud_cost_str = "High ($1000s+/month)" # usage-based cloud + edge_cost_str = "Moderate ($100s-1000s)" # hardware capex + mobile_cost_str = "Low ($0-10s)" # app distribution + tiny_cost_str = "Very Low ($1-10s)" # MCU unit cost + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +cloud_comp_str = ParadigmsTable.cloud_comp_str +cloud_cost_str = ParadigmsTable.cloud_cost_str +cloud_lat_str = ParadigmsTable.cloud_lat_str +cloud_pwr_str = ParadigmsTable.cloud_pwr_str +cloud_stor_str = ParadigmsTable.cloud_stor_str +edge_comp_str = ParadigmsTable.edge_comp_str +edge_cost_str = ParadigmsTable.edge_cost_str +edge_lat_str = ParadigmsTable.edge_lat_str +edge_pwr_str = ParadigmsTable.edge_pwr_str +edge_stor_str = ParadigmsTable.edge_stor_str +mobile_comp_str = ParadigmsTable.mobile_comp_str +mobile_cost_str = ParadigmsTable.mobile_cost_str +mobile_lat_str = ParadigmsTable.mobile_lat_str +mobile_pwr_str = ParadigmsTable.mobile_pwr_str +mobile_stor_str = ParadigmsTable.mobile_stor_str +tiny_comp_str = ParadigmsTable.tiny_comp_str +tiny_cost_str = ParadigmsTable.tiny_cost_str +tiny_lat_str = ParadigmsTable.tiny_lat_str +tiny_pwr_str = ParadigmsTable.tiny_pwr_str +tiny_stor_str = ParadigmsTable.tiny_stor_str ``` The resulting 
fourteen-dimension comparison appears in @tbl-big_vs_tiny: @@ -3250,6 +3472,7 @@ A related misconception holds that moving computation closer to the user always ```{python} #| label: mobile-power-fallacy-calc #| echo: false + # ┌───────────────────────────────────────────────────────────────────────────── # │ MOBILE POWER FALLACY: BATTERY DEPLETION CALCULATIONS # ├───────────────────────────────────────────────────────────────────────────── @@ -3263,21 +3486,32 @@ A related misconception holds that moving computation closer to the user always # │ Exports: low_power_hours_str, high_power_hours_str, # │ low_power_frac, high_power_frac # └───────────────────────────────────────────────────────────────────────────── + from mlsys.formatting import fmt, check, md_frac -battery_wh_value = 15 # Wh, typical smartphone -low_power_w_value = 1 # W, light inference -high_power_w_value = 5 # W, heavy on-device model +# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── +class MobilePowerFallacyCalc: + """Namespace for Mobile Power Fallacy Calc.""" -low_power_hours_value = battery_wh_value / low_power_w_value # 15 / 1 = 15 hours -high_power_hours_value = battery_wh_value / high_power_w_value # 15 / 5 = 3 hours + battery_wh_value = 15 # Wh, typical smartphone + low_power_w_value = 1 # W, light inference + high_power_w_value = 5 # W, heavy on-device model -low_power_hours_str = fmt(low_power_hours_value, precision=0, commas=False) # "15" -high_power_hours_str = fmt(high_power_hours_value, precision=0, commas=False) # "3" + low_power_hours_value = battery_wh_value / low_power_w_value # 15 / 1 = 15 hours + high_power_hours_value = battery_wh_value / high_power_w_value # 15 / 5 = 3 hours -# --- Inline fractions showing the physics --- -low_power_frac = md_frac(f"{battery_wh_value} Wh", f"{low_power_w_value} W", f"**{low_power_hours_str} hours**") -high_power_frac = md_frac(f"{battery_wh_value} Wh", f"{high_power_w_value} W", f"**{high_power_hours_str} 
hours**") + low_power_hours_str = fmt(low_power_hours_value, precision=0, commas=False) # "15" + high_power_hours_str = fmt(high_power_hours_value, precision=0, commas=False) # "3" + + # --- Inline fractions showing the physics --- + low_power_frac = md_frac(f"{battery_wh_value} Wh", f"{low_power_w_value} W", f"**{low_power_hours_str} hours**") + high_power_frac = md_frac(f"{battery_wh_value} Wh", f"{high_power_w_value} W", f"**{high_power_hours_str} hours**") + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +low_power_hours_str = MobilePowerFallacyCalc.low_power_hours_str +high_power_hours_str = MobilePowerFallacyCalc.high_power_hours_str +low_power_frac = MobilePowerFallacyCalc.low_power_frac +high_power_frac = MobilePowerFallacyCalc.high_power_frac ``` **Fallacy:** *Model optimization overcomes mobile device power and thermal limits.* @@ -3296,6 +3530,7 @@ The difference is qualitative, not just quantitative. As @sec-ml-systems-tinyml- ```{python} #| label: tco-pitfall-calc #| echo: false + # ┌───────────────────────────────────────────────────────────────────────────── # │ TCO PITFALL: EDGE VS CLOUD TOTAL COST OF OWNERSHIP # ├───────────────────────────────────────────────────────────────────────────── @@ -3309,29 +3544,43 @@ The difference is qualitative, not just quantitative. As @sec-ml-systems-tinyml- # │ Exports: cloud_compute_str, edge_hw_str, edge_network_str, edge_maint_str, # │ edge_reliability_str, edge_total_str, tco_ratio_str # └───────────────────────────────────────────────────────────────────────────── + from mlsys.formatting import fmt, check -# Cloud costs (monthly) -cloud_compute_value = 2000 # $, inference compute +# ┌── P.I.C.O. 
ISOLATED SCENARIO ─────────────────────────────────────────────── +class TcoPitfallCalc: + """Namespace for Tco Pitfall Calc.""" -# Edge costs (monthly) -edge_hardware_value = 500 # $, amortized hardware -edge_network_value = 3000 # $, network engineering -edge_maintenance_value = 500 # $, hardware maintenance -edge_reliability_value = 2000 # $, reliability engineering + # Cloud costs (monthly) + cloud_compute_value = 2000 # $, inference compute -edge_total_value = (edge_hardware_value + edge_network_value + - edge_maintenance_value + edge_reliability_value) # $6,000 + # Edge costs (monthly) + edge_hardware_value = 500 # $, amortized hardware + edge_network_value = 3000 # $, network engineering + edge_maintenance_value = 500 # $, hardware maintenance + edge_reliability_value = 2000 # $, reliability engineering -tco_ratio_value = edge_total_value / cloud_compute_value # 3x + edge_total_value = (edge_hardware_value + edge_network_value + + edge_maintenance_value + edge_reliability_value) # $6,000 -cloud_compute_str = fmt(cloud_compute_value, precision=0, commas=True) # "2,000" -edge_hw_str = fmt(edge_hardware_value, precision=0, commas=False) # "500" -edge_network_str = fmt(edge_network_value, precision=0, commas=True) # "3,000" -edge_maint_str = fmt(edge_maintenance_value, precision=0, commas=False) # "500" -edge_reliability_str = fmt(edge_reliability_value, precision=0, commas=True) # "2,000" -edge_total_str = fmt(edge_total_value, precision=0, commas=True) # "6,000" -tco_ratio_str = fmt(tco_ratio_value, precision=0, commas=False) # "3" + tco_ratio_value = edge_total_value / cloud_compute_value # 3x + + cloud_compute_str = fmt(cloud_compute_value, precision=0, commas=True) # "2,000" + edge_hw_str = fmt(edge_hardware_value, precision=0, commas=False) # "500" + edge_network_str = fmt(edge_network_value, precision=0, commas=True) # "3,000" + edge_maint_str = fmt(edge_maintenance_value, precision=0, commas=False) # "500" + edge_reliability_str = 
fmt(edge_reliability_value, precision=0, commas=True) # "2,000" + edge_total_str = fmt(edge_total_value, precision=0, commas=True) # "6,000" + tco_ratio_str = fmt(tco_ratio_value, precision=0, commas=False) # "3" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +cloud_compute_str = TcoPitfallCalc.cloud_compute_str +edge_hw_str = TcoPitfallCalc.edge_hw_str +edge_network_str = TcoPitfallCalc.edge_network_str +edge_maint_str = TcoPitfallCalc.edge_maint_str +edge_reliability_str = TcoPitfallCalc.edge_reliability_str +edge_total_str = TcoPitfallCalc.edge_total_str +tco_ratio_str = TcoPitfallCalc.tco_ratio_str ``` **Pitfall:** *Minimizing computational resources minimizes total cost.* @@ -3341,6 +3590,7 @@ Teams optimize per-unit resource consumption while ignoring operational overhead ```{python} #| label: amdahl-camera-calc #| echo: false + # ┌───────────────────────────────────────────────────────────────────────────── # │ AMDAHL'S LAW: CAMERA PIPELINE EXAMPLE # ├───────────────────────────────────────────────────────────────────────────── @@ -3353,37 +3603,54 @@ Teams optimize per-unit resource consumption while ignoring operational overhead # │ Imports: mlsys.formatting (fmt) # │ Exports: cam_*_str variables for prose # └───────────────────────────────────────────────────────────────────────────── + from mlsys.formatting import fmt, check -# --- Inputs (smartphone camera pipeline stages) --- -cam_isp_ms_value = 100 # ms, ISP + auto-exposure -cam_ml_ms_value = 60 # ms, ML scene classification -cam_post_ms_value = 40 # ms, tone mapping + HDR merge -cam_ml_speedup_value = 10 # 10× faster ML model +# ┌── P.I.C.O. 
ISOLATED SCENARIO ─────────────────────────────────────────────── +class AmdahlCameraCalc: + """Namespace for Amdahl Camera Calc.""" -# --- Process (Amdahl's Law) --- -cam_total_ms_value = cam_isp_ms_value + cam_ml_ms_value + cam_post_ms_value # 200 ms -cam_ml_frac_value = cam_ml_ms_value / cam_total_ms_value # 0.30 -cam_non_ml_frac_value = 1 - cam_ml_frac_value # 0.70 + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + cam_isp_ms_value = 100 # ms, ISP + auto-exposure + cam_ml_ms_value = 60 # ms, ML scene classification + cam_post_ms_value = 40 # ms, tone mapping + HDR merge + cam_ml_speedup_value = 10 # 10× faster ML model -cam_speedup_10x_value = 1 / (cam_non_ml_frac_value + cam_ml_frac_value / cam_ml_speedup_value) -cam_speedup_inf_value = 1 / cam_non_ml_frac_value # theoretical max -cam_ml_optimized_ms_value = cam_ml_ms_value / cam_ml_speedup_value # 6 ms -cam_total_optimized_ms_value = (cam_isp_ms_value + - cam_ml_optimized_ms_value + - cam_post_ms_value) # 146 ms + # ┌── 2. 
CALCULATION (The Physics) ──────────────────────────────────────── + cam_total_ms_value = cam_isp_ms_value + cam_ml_ms_value + cam_post_ms_value # 200 ms + cam_ml_frac_value = cam_ml_ms_value / cam_total_ms_value # 0.30 + cam_non_ml_frac_value = 1 - cam_ml_frac_value # 0.70 -# --- Outputs (formatted strings for prose) --- -cam_isp_str = fmt(cam_isp_ms_value, precision=0, commas=False) # "100" -cam_ml_str = fmt(cam_ml_ms_value, precision=0, commas=False) # "60" -cam_post_str = fmt(cam_post_ms_value, precision=0, commas=False) # "40" -cam_total_str = fmt(cam_total_ms_value, precision=0, commas=False) # "200" -cam_ml_pct_str = fmt(cam_ml_frac_value * 100, precision=0, commas=False) # "30" -cam_non_ml_pct_str = fmt(cam_non_ml_frac_value * 100, precision=0, commas=False) # "70" -cam_speedup_10x_str = fmt(cam_speedup_10x_value, precision=2, commas=False) # "1.37" -cam_speedup_inf_str = fmt(cam_speedup_inf_value, precision=2, commas=False) # "1.43" -cam_ml_opt_str = fmt(cam_ml_optimized_ms_value, precision=0, commas=False) # "6" -cam_total_opt_str = fmt(cam_total_optimized_ms_value, precision=0, commas=False) # "146" + cam_speedup_10x_value = 1 / (cam_non_ml_frac_value + cam_ml_frac_value / cam_ml_speedup_value) + cam_speedup_inf_value = 1 / cam_non_ml_frac_value # theoretical max + cam_ml_optimized_ms_value = cam_ml_ms_value / cam_ml_speedup_value # 6 ms + cam_total_optimized_ms_value = (cam_isp_ms_value + + cam_ml_optimized_ms_value + + cam_post_ms_value) # 146 ms + + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + cam_isp_str = fmt(cam_isp_ms_value, precision=0, commas=False) # "100" + cam_ml_str = fmt(cam_ml_ms_value, precision=0, commas=False) # "60" + cam_post_str = fmt(cam_post_ms_value, precision=0, commas=False) # "40" + cam_total_str = fmt(cam_total_ms_value, precision=0, commas=False) # "200" + cam_ml_pct_str = fmt(cam_ml_frac_value * 100, precision=0, commas=False) # "30" + cam_non_ml_pct_str = fmt(cam_non_ml_frac_value * 100, precision=0, commas=False) # "70" + cam_speedup_10x_str = fmt(cam_speedup_10x_value, precision=2, commas=False) # "1.37" + cam_speedup_inf_str = fmt(cam_speedup_inf_value, precision=2, commas=False) # "1.43" + cam_ml_opt_str = fmt(cam_ml_optimized_ms_value, precision=0, commas=False) # "6" + cam_total_opt_str = fmt(cam_total_optimized_ms_value, precision=0, commas=False) # "146" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +cam_isp_str = AmdahlCameraCalc.cam_isp_str +cam_ml_opt_str = AmdahlCameraCalc.cam_ml_opt_str +cam_ml_pct_str = AmdahlCameraCalc.cam_ml_pct_str +cam_ml_str = AmdahlCameraCalc.cam_ml_str +cam_non_ml_pct_str = AmdahlCameraCalc.cam_non_ml_pct_str +cam_post_str = AmdahlCameraCalc.cam_post_str +cam_speedup_10x_str = AmdahlCameraCalc.cam_speedup_10x_str +cam_speedup_inf_str = AmdahlCameraCalc.cam_speedup_inf_str +cam_total_opt_str = AmdahlCameraCalc.cam_total_opt_str +cam_total_str = AmdahlCameraCalc.cam_total_str ``` **Fallacy:** *Model optimization translates linearly to system speedup.* diff --git a/book/quarto/contents/vol1/nn_architectures/nn_architectures.qmd b/book/quarto/contents/vol1/nn_architectures/nn_architectures.qmd index 89fb88170..091a502ad 100644 --- a/book/quarto/contents/vol1/nn_architectures/nn_architectures.qmd +++ b/book/quarto/contents/vol1/nn_architectures/nn_architectures.qmd @@ -219,7 +219,7 @@ The quantitative characteristics of these Lighthouse models expose a critical en from mlsys import 
Hardware, Models from mlsys.constants import ( - A100_MEM_CAPACITY, BYTES_FP32, BYTES_FP16, Mparam, Bparam, Kparam, GFLOPs, MFLOPs, MB, GB, KB, GiB + A100_MEM_CAPACITY, BYTES_FP32, BYTES_FP16, param, Mparam, Bparam, Kparam, GFLOPs, MFLOPs, MB, GB, KB, GiB ) from mlsys.formatting import fmt, check from mlsys.formulas import model_memory @@ -242,35 +242,35 @@ class LighthouseSpecs: # ┌── 2. CALCULATION (The Physics) ───────────────────────────────────────── # ResNet-50 - resnet_params = m_resnet.parameters.to(Mparam).magnitude - resnet_flops = m_resnet.inference_flops.to(GFLOPs).magnitude - resnet_mem_mb = m_resnet.size_in_bytes(BYTES_FP32).to(MB).magnitude + resnet_params = m_resnet.parameters.m_as(Mparam) + resnet_flops = m_resnet.inference_flops.m_as(GFLOPs) + resnet_mem_mb = m_resnet.size_in_bytes(BYTES_FP32).m_as(MB) # GPT-2 XL - gpt2_params = m_gpt2.parameters.to(Bparam).magnitude + gpt2_params = m_gpt2.parameters.m_as(Bparam) gpt2_flops_token = 3.0 # Approximate - gpt2_mem_gb = m_gpt2.size_in_bytes(BYTES_FP32).to(GB).magnitude + gpt2_mem_gb = m_gpt2.size_in_bytes(BYTES_FP32).m_as(GB) # DLRM dlrm_entries_b = 25.0 # 25B entries - dlrm_mem_gb = m_dlrm.model_size.to(GB).magnitude + dlrm_mem_gb = m_dlrm.model_size.m_as(GB) # MobileNetV2 - mobilenet_params = m_mobilenet.parameters.to(Mparam).magnitude - mobilenet_flops = m_mobilenet.inference_flops.to(MFLOPs).magnitude - mobilenet_mem_mb = m_mobilenet.size_in_bytes(BYTES_FP32).to(MB).magnitude + mobilenet_params = m_mobilenet.parameters.m_as(Mparam) + mobilenet_flops = m_mobilenet.inference_flops.m_as(MFLOPs) + mobilenet_mem_mb = m_mobilenet.size_in_bytes(BYTES_FP32).m_as(MB) # KWS (DS-CNN) - kws_params_k = m_kws.parameters.to(Kparam).magnitude - kws_flops_m = m_kws.inference_flops.to(MFLOPs).magnitude - kws_mem_kb = m_kws.size_in_bytes(BYTES_FP32).to(KB).magnitude + kws_params_k = m_kws.parameters.m_as(Kparam) + kws_flops_m = m_kws.inference_flops.m_as(MFLOPs) + kws_mem_kb = m_kws.size_in_bytes(BYTES_FP32).m_as(KB) 
# Ratios - mobilenet_size_ratio = m_resnet.parameters.magnitude / m_mobilenet.parameters.magnitude - mobilenet_flops_ratio = (m_resnet.inference_flops / m_mobilenet.inference_flops).to('count').magnitude + mobilenet_size_ratio = m_resnet.parameters.m_as(param) / m_mobilenet.parameters.m_as(param) + mobilenet_flops_ratio = (m_resnet.inference_flops / m_mobilenet.inference_flops).m_as('count') # Reference Hardware - a100_mem = hw_a100.memory_capacity.to(GiB).magnitude + a100_mem = hw_a100.memory_capacity.m_as(GiB) # ┌── 3. INVARIANTS (Guardrails) ─────────────────────────────────────────── # Ensure numbers match the book's narrative @@ -288,7 +288,7 @@ class LighthouseSpecs: gpt2_gflops_per_token_str = fmt(gpt2_flops_token, precision=1) # GPT-3 context - gpt3_fp16_gb_str = fmt(Models.GPT3.size_in_bytes(BYTES_FP16).to(GB).magnitude, precision=0) + gpt3_fp16_gb_str = fmt(Models.GPT3.size_in_bytes(BYTES_FP16).m_as(GB), precision=0) dlrm_entries_b_str = fmt(dlrm_entries_b, precision=0) dlrm_model_size_gb_str = fmt(dlrm_mem_gb, precision=0) @@ -490,8 +490,8 @@ class MLPvsCNN: check(ratio >= 10, f"MLP ({mlp_p}) isn't significantly larger than CNN ({cnn_p}). Ratio: {ratio}x") # ┌── 4. OUTPUTS (Formatting) ────────────────────────────────────────────── - mlp_params_str = f"{(mlp_p * param).to(Mparam).magnitude:.0f}M" - cnn_params_str = f"{(cnn_p * param).to(Kparam).magnitude:.0f}K" + mlp_params_str = f"{(mlp_p * param).m_as(Mparam):.0f}M" + cnn_params_str = f"{(cnn_p * param).m_as(Kparam):.0f}K" param_ratio_str = f"{ratio}" # Note: Use MLPvsCNN.mlp_params_str directly. @@ -859,10 +859,10 @@ class A100Specs: # ┌── 1. 
PARAMETERS (Inputs) ─────────────────────────────────────────────── # A100 performance at various precisions - fp16_tensor = A100_FLOPS_FP16_TENSOR.to(TFLOPs/second).magnitude - int8_tensor = A100_FLOPS_INT8.to(TFLOPs/second).magnitude - fp32_cuda = A100_FLOPS_FP32.to(TFLOPs/second).magnitude - tf32_tensor = A100_FLOPS_TF32.to(TFLOPs/second).magnitude + fp16_tensor = A100_FLOPS_FP16_TENSOR.m_as(TFLOPs/second) + int8_tensor = A100_FLOPS_INT8.m_as(TFLOPs/second) + fp32_cuda = A100_FLOPS_FP32.m_as(TFLOPs/second) + tf32_tensor = A100_FLOPS_TF32.m_as(TFLOPs/second) # ┌── 4. OUTPUTS (Formatting) ────────────────────────────────────────────── a100_tflops_fp16_str = fmt(fp16_tensor, precision=0, commas=False) @@ -2364,17 +2364,27 @@ Attention mechanisms create computational patterns that differ significantly fro # │ Exports: attn_score_macs_m_str # └───────────────────────────────────────────────────────────────────────────── +from mlsys.constants import MILLION from mlsys.formatting import fmt, check -# --- Inputs (typical attention configuration) --- -attn_seq_len_value = 512 # sequence length -attn_head_dim_value = 64 # dimension per head +class AttentionComputeCosts: + """Demonstrate quadratic compute cost of self-attention at sequence length 512.""" -# --- Computation costs --- -attn_score_macs_value = attn_seq_len_value * attn_seq_len_value * attn_head_dim_value + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + seq_len = 512 # sequence length + head_dim = 64 # dimension per head -# --- Outputs (formatted strings for prose) --- -attn_score_macs_m_str = fmt(attn_score_macs_value / MILLION, precision=1, commas=False) # e.g. "16.8" + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + score_macs = seq_len * seq_len * head_dim + + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(score_macs > MILLION, "Attention MACs should exceed 1M for seq_len=512.") + + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + attn_score_macs_m_str = fmt(score_macs / MILLION, precision=1, commas=False) # e.g. "16.8" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +attn_score_macs_m_str = AttentionComputeCosts.attn_score_macs_m_str ``` ::: {#lst-attention_layer_compute lst-cap="**Attention Computation**: Two implementations showing the same O(N^2 $\times$ d) complexity. The matrix form (top) uses optimized GEMM, while the nested loops (bottom) expose the quadratic pairwise comparisons: for sequence length 512 and dimension 64, computing attention scores requires 512 $\times$ 512 $\times$ 64 = `{python} attn_score_macs_m_str` million MACs per attention head, plus another `{python} attn_score_macs_m_str`M for value aggregation."} @@ -2471,7 +2481,7 @@ class AttentionMemory: # ┌── 1. PARAMETERS (Inputs) ─────────────────────────────────────────────── seq_len = 100_000 - bytes_per_element = BYTES_FP16.magnitude + bytes_per_element = BYTES_FP16.m_as(byte) num_layers = 32 num_heads = 12 @@ -2886,7 +2896,7 @@ class DLRMEmbedding: # ┌── 2. CALCULATION (The Physics) ───────────────────────────────────────── table_bytes = num_users * embed_dim * bytes_per_param - table_gb = (table_bytes * byte).to(GB).magnitude + table_gb = (table_bytes * byte).m_as(GB) # ┌── 3. INVARIANTS (Guardrails) ─────────────────────────────────────────── check(table_gb >= 80, f"DLRM table ({table_gb:.1f} GB) fits on an A100. It must be larger to justify model parallelism.") @@ -2964,12 +2974,12 @@ class CapacityWall: # ┌── 1. PARAMETERS (Inputs) ─────────────────────────────────────────────── num_items = 100_000_000 embed_dim = 128 - bytes_per_param = BYTES_FP32.magnitude + bytes_per_param = BYTES_FP32.m_as(byte) # ┌── 2. 
CALCULATION (The Physics) ───────────────────────────────────────── table_bytes = num_items * embed_dim * bytes_per_param - table_gb = (table_bytes * byte).to(GB).magnitude - a100_capacity_gb = A100_MEM_CAPACITY.to(GB).magnitude + table_gb = (table_bytes * byte).m_as(GB) + a100_capacity_gb = A100_MEM_CAPACITY.m_as(GB) utilization_pct = (table_gb / a100_capacity_gb) * 100 # ┌── 4. OUTPUTS (Formatting) ────────────────────────────────────────────── @@ -3166,13 +3176,27 @@ Recall the plain 50-layer network from the analysis above: loss stuck at 1.8, on from mlsys.formatting import fmt, check -# --- Empirical overhead measurements --- -skip_memory_overhead_pct_value = 20 # activation storage -skip_epoch_cost_pct_value = 10 # per-epoch compute +class ResNetSkipOverhead: + """Quantify systems cost of residual connections: ~20% memory overhead.""" -# --- Outputs (formatted strings for prose) --- -skip_memory_overhead_pct_str = fmt(skip_memory_overhead_pct_value, precision=0, commas=False) # e.g. "20" -skip_epoch_cost_pct_str = fmt(skip_epoch_cost_pct_value, precision=0, commas=False) # e.g. "10" + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + memory_overhead_pct = 20 # activation storage + epoch_cost_pct = 10 # per-epoch compute + + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + # Values are empirical anchors; no derived calculation needed. + + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(0 < memory_overhead_pct < 100, "Memory overhead must be a valid percentage.") + check(0 < epoch_cost_pct < 100, "Epoch cost must be a valid percentage.") + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + skip_memory_overhead_pct_str = fmt(memory_overhead_pct, precision=0, commas=False) # e.g. "20" + skip_epoch_cost_pct_str = fmt(epoch_cost_pct, precision=0, commas=False) # e.g. 
"10" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +skip_memory_overhead_pct_str = ResNetSkipOverhead.skip_memory_overhead_pct_str +skip_epoch_cost_pct_str = ResNetSkipOverhead.skip_epoch_cost_pct_str ``` While skip connections solve gradient flow, they introduce system-level costs. Memory overhead increases because skip connections require storing the input to each residual block for the addition operation during the forward pass and for backpropagation. For a ResNet-50 with batch size 32 processing $224 \times 224$ RGB images, this adds approximately `{python} skip_memory_overhead_pct_str`% memory overhead compared to a plain network. The computational cost of the addition operation ($y = \mathcal{F}(x) + x$) is computationally trivial, adding negligible compute time. The primary cost is the residual function $\mathcal{F}(x)$ itself. @@ -3654,16 +3678,29 @@ Energy consumption patterns vary dramatically across neural network architecture # │ Exports: energy_mac_pj_str, energy_dram_str # └───────────────────────────────────────────────────────────────────────────── -from mlsys.constants import ENERGY_DRAM_ACCESS_PJ +from mlsys.constants import ENERGY_DRAM_ACCESS_PJ, ureg from mlsys.formatting import fmt, check -# --- Energy costs (from Horowitz 2014) --- -energy_mac_pj_value = 4.6 # pJ per MAC (45nm) -energy_dram_value = ENERGY_DRAM_ACCESS_PJ.magnitude # pJ per 32-bit access +class EnergyConsumptionAnalysis: + """Contrast energy cost of compute vs. data movement: DRAM access is ~5x more costly.""" -# --- Outputs (formatted strings for prose) --- -energy_mac_pj_str = f"{energy_mac_pj_value}" # e.g. "4.6" -energy_dram_str = fmt(energy_dram_value, precision=0, commas=False) # e.g. "26" + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + mac_pj = 4.6 # pJ per MAC (Horowitz 2014, 45nm) + dram_pj = ENERGY_DRAM_ACCESS_PJ.m_as(ureg.picojoule) # pJ per 32-bit access + + # ┌── 2. 
CALCULATION (The Physics) ──────────────────────────────────────── + dram_to_mac_ratio = dram_pj / mac_pj + + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(dram_to_mac_ratio > 1, "DRAM access must cost more energy than a MAC.") + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + energy_mac_pj_str = f"{mac_pj}" # e.g. "4.6" + energy_dram_str = fmt(dram_pj, precision=0, commas=False) # e.g. "26" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +energy_mac_pj_str = EnergyConsumptionAnalysis.energy_mac_pj_str +energy_dram_str = EnergyConsumptionAnalysis.energy_dram_str ``` Dense matrix operations in MLPs achieve excellent arithmetic intensity[^fn-arithmetic-intensity-dnn] (computation per data movement) but consume significant absolute energy. Each multiply-accumulate operation consumes approximately `{python} energy_mac_pj_str` pJ, while data movement from DRAM costs `{python} energy_dram_str` pJ per 32-bit value [@horowitz2014computing]. Given this energy ratio, typical MLP inference spends the majority of its energy budget on data movement rather than computation, making memory bandwidth optimization critical for energy efficiency. @@ -3745,17 +3782,29 @@ CNNs benefit from specialized convolution algorithms and data layout optimizatio from mlsys.formatting import fmt, check -# --- Standard vs Winograd multiply counts for 3x3 conv --- -std_muls_3x3_value = 9 # 3x3 = 9 muls -winograd_muls_value = 4 # Winograd F(2,3) +class WinogradCalc: + """Demonstrate 2.25x multiplication reduction of Winograd F(2,3) vs standard 3x3 conv.""" -# --- Reduction ratio --- -winograd_reduction_value = std_muls_3x3_value / winograd_muls_value + # ┌── 1. 
PARAMETERS (Inputs) ────────────────────────────────────────────── + std_muls_3x3 = 9 # 3x3 = 9 multiplies + winograd_muls = 4 # Winograd F(2,3) multiplies -# --- Outputs (formatted strings for prose) --- -winograd_reduction_str = fmt(winograd_reduction_value, precision=2, commas=False) # e.g. "2.25" -std_muls_3x3_str = f"{std_muls_3x3_value}" # e.g. "9" -winograd_muls_str = f"{winograd_muls_value}" # e.g. "4" + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + winograd_reduction = std_muls_3x3 / winograd_muls + + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(winograd_reduction > 1, "Winograd must reduce multiply count.") + check(abs(winograd_reduction - 2.25) < 0.01, "Winograd F(2,3) must yield 2.25x reduction.") + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + winograd_reduction_str = fmt(winograd_reduction, precision=2, commas=False) # e.g. "2.25" + std_muls_3x3_str = f"{std_muls_3x3}" # e.g. "9" + winograd_muls_str = f"{winograd_muls}" # e.g. "4" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +winograd_reduction_str = WinogradCalc.winograd_reduction_str +std_muls_3x3_str = WinogradCalc.std_muls_3x3_str +winograd_muls_str = WinogradCalc.winograd_muls_str ``` [^fn-winograd]: **Winograd Algorithms**\index{Winograd Algorithm}: Fast convolution algorithms based on Shmuel Winograd's 1980 work on minimal multiplication complexity. For 3 $\times$ 3 convolutions, Winograd reduces multiply operations from `{python} std_muls_3x3_str` to `{python} winograd_muls_str` per output (`{python} winograd_reduction_str` $\times$ reduction) by trading multiplications for additions, which cost less in terms of both latency and energy. Modern deep learning frameworks like cuDNN automatically select Winograd for appropriate layer configurations, though numerical precision degradation at FP16 limits applicability for mixed-precision training. 
@@ -3883,32 +3932,50 @@ This section synthesizes the chapter's concepts through a complete architecture from mlsys.formatting import fmt, check from mlsys.constants import RESNET50_FLOPs, GFLOPs, TFLOPs -# --- Inputs (real-time video processing) --- -tc_fps_value = 30 # target frame rate -tc_midrange_gpu_tflops_value = 10 # reference mid-range GPU -tc_objdet_gflops_value = 100 # object detection model +class ThroughputCeilingCalc: + """Evaluate real-time vision feasibility: ResNet-50 at 30 FPS leaves ample headroom.""" -# --- Computation --- -tc_resnet_gflops_value = RESNET50_FLOPs.to(GFLOPs).magnitude -tc_sustained_gflops_value = tc_fps_value * tc_resnet_gflops_value -tc_effective_tflops_low_value = tc_midrange_gpu_tflops_value * 0.50 # 50% utilization -tc_effective_tflops_high_value = tc_midrange_gpu_tflops_value * 0.60 # 60% utilization -tc_headroom_value = tc_effective_tflops_low_value * 1000 / tc_sustained_gflops_value + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + fps = 30 # target frame rate + midrange_gpu_tflops = 10 # reference mid-range GPU (TFLOPS) + objdet_gflops = 100 # object detection model (GFLOPs) -tc_objdet_sustained_value = (tc_fps_value * tc_objdet_gflops_value * GFLOPs).to(TFLOPs).magnitude -tc_objdet_headroom_value = tc_effective_tflops_low_value / tc_objdet_sustained_value + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + resnet_gflops = RESNET50_FLOPs.m_as(GFLOPs) + sustained_gflops = fps * resnet_gflops + effective_tflops_low = midrange_gpu_tflops * 0.50 # 50% utilization + effective_tflops_high = midrange_gpu_tflops * 0.60 # 60% utilization + headroom = effective_tflops_low * 1000 / sustained_gflops -# --- Outputs (formatted strings for prose) --- -tc_fps_str = f"{tc_fps_value}" # e.g. "30" -tc_resnet_gflops_str = fmt(tc_resnet_gflops_value, precision=0, commas=False) # e.g. "4" -tc_sustained_gflops_str = fmt(tc_sustained_gflops_value, precision=0, commas=False) # e.g. 
"123" -tc_gpu_tflops_str = f"{tc_midrange_gpu_tflops_value}" # e.g. "10" -tc_effective_low_str = fmt(tc_effective_tflops_low_value, precision=0, commas=False) # e.g. "5" -tc_effective_high_str = fmt(tc_effective_tflops_high_value, precision=0, commas=False) # e.g. "6" -tc_headroom_str = fmt(tc_headroom_value, precision=0, commas=False) # e.g. "41" -tc_objdet_gflops_str = f"{tc_objdet_gflops_value}" # e.g. "100" -tc_objdet_sustained_str = fmt(tc_objdet_sustained_value, precision=0, commas=False) # e.g. "3" -tc_objdet_headroom_str = fmt(tc_objdet_headroom_value, precision=0, commas=False) # e.g. "2" + objdet_sustained_tflops = (fps * objdet_gflops * GFLOPs).m_as(TFLOPs) + objdet_headroom = effective_tflops_low / objdet_sustained_tflops + + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(headroom > 1, "ResNet-50 at 30 FPS must leave compute headroom on a mid-range GPU.") + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + tc_fps_str = f"{fps}" # e.g. "30" + tc_resnet_gflops_str = fmt(resnet_gflops, precision=0, commas=False) # e.g. "4" + tc_sustained_gflops_str = fmt(sustained_gflops, precision=0, commas=False) # e.g. "123" + tc_gpu_tflops_str = f"{midrange_gpu_tflops}" # e.g. "10" + tc_effective_low_str = fmt(effective_tflops_low, precision=0, commas=False) # e.g. "5" + tc_effective_high_str = fmt(effective_tflops_high, precision=0, commas=False) # e.g. "6" + tc_headroom_str = fmt(headroom, precision=0, commas=False) # e.g. "41" + tc_objdet_gflops_str = f"{objdet_gflops}" # e.g. "100" + tc_objdet_sustained_str = fmt(objdet_sustained_tflops, precision=0, commas=False) # e.g. "3" + tc_objdet_headroom_str = fmt(objdet_headroom, precision=0, commas=False) # e.g. 
"2" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +tc_fps_str = ThroughputCeilingCalc.tc_fps_str +tc_resnet_gflops_str = ThroughputCeilingCalc.tc_resnet_gflops_str +tc_sustained_gflops_str = ThroughputCeilingCalc.tc_sustained_gflops_str +tc_gpu_tflops_str = ThroughputCeilingCalc.tc_gpu_tflops_str +tc_effective_low_str = ThroughputCeilingCalc.tc_effective_low_str +tc_effective_high_str = ThroughputCeilingCalc.tc_effective_high_str +tc_headroom_str = ThroughputCeilingCalc.tc_headroom_str +tc_objdet_gflops_str = ThroughputCeilingCalc.tc_objdet_gflops_str +tc_objdet_sustained_str = ThroughputCeilingCalc.tc_objdet_sustained_str +tc_objdet_headroom_str = ThroughputCeilingCalc.tc_objdet_headroom_str ``` ::: {.callout-notebook title="The Throughput Ceiling"} @@ -3944,50 +4011,68 @@ tc_objdet_headroom_str = fmt(tc_objdet_headroom_value, precision=0, commas=False from mlsys.formatting import fmt, check from mlsys.constants import KWS_DSCNN_PARAMS, KWS_DSCNN_FLOPs, Kparam, MFLOPs -# --- MobileNetV1 specs --- -mnv1_params_m_value = 4.2 # millions of params -mnv1_flops_mflops_value = 569 # MFLOPs at 224x224 +class WildlifeModelSizing: + """Select model architecture for constrained edge deployment: MobileNetV2 fits 512 MB.""" -# --- MobileNetV2 (0.75x width) specs --- -mnv2_params_m_value = 2.2 # millions of params -mnv2_flops_mflops_value = 150 # MFLOPs at 224x224 + # ┌── 1. 
PARAMETERS (Inputs) ────────────────────────────────────────────── + # MobileNetV1 specs + mnv1_params_m = 4.2 # millions of params + mnv1_flops_mflops = 569 # MFLOPs at 224x224 -# --- Edge deployment power assumptions --- -inference_power_mw_value = 200 # milliwatts during inference -inference_latency_ms_value = 75 # ms per inference -inferences_per_day_value = 100 # trigger-based + # MobileNetV2 (0.75x width) specs + mnv2_params_m = 2.2 # millions of params + mnv2_flops_mflops = 150 # MFLOPs at 224x224 -# --- Memory calculations --- -mnv1_fp32_mb_value = mnv1_params_m_value * 4 # FP32: 4 bytes/param -mnv1_int8_mb_value = mnv1_params_m_value * 1 # INT8: 1 byte/param -mnv2_fp32_mb_value = mnv2_params_m_value * 4 -mnv2_int8_mb_value = mnv2_params_m_value * 1 + # Edge deployment power assumptions + inference_power_mw = 200 # milliwatts during inference + inference_latency_ms = 75 # ms per inference + inferences_per_day = 100 # trigger-based -# --- KWS reference (too small for 50-species task) --- -kws_example_params_k_value = KWS_DSCNN_PARAMS.to(Kparam).magnitude -kws_example_flops_mflops_value = KWS_DSCNN_FLOPs.to(MFLOPs).magnitude + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + # Memory footprints + mnv1_fp32_mb = mnv1_params_m * 4 # FP32: 4 bytes/param + mnv1_int8_mb = mnv1_params_m * 1 # INT8: 1 byte/param + mnv2_fp32_mb = mnv2_params_m * 4 + mnv2_int8_mb = mnv2_params_m * 1 -# --- Energy calculations --- -energy_per_inf_mj_value = ( - inference_power_mw_value * inference_latency_ms_value / 1000 -) -energy_per_day_j_value = ( - inferences_per_day_value * energy_per_inf_mj_value / 1000 -) + # KWS reference (too small for 50-species task) + kws_example_params_k = KWS_DSCNN_PARAMS.m_as(Kparam) + kws_example_flops_mflops = KWS_DSCNN_FLOPs.m_as(MFLOPs) -# --- Outputs (formatted strings for prose) --- -mnv1_params_str = fmt(mnv1_params_m_value, precision=1, commas=False) # e.g. 
"4.2" -mnv1_flops_str = fmt(mnv1_flops_mflops_value, precision=0, commas=False) # e.g. "569" -mnv1_fp32_str = fmt(mnv1_fp32_mb_value, precision=0, commas=False) # e.g. "17" -mnv1_int8_str = fmt(mnv1_int8_mb_value, precision=0, commas=False) # e.g. "4" -mnv2_params_str = fmt(mnv2_params_m_value, precision=1, commas=False) # e.g. "2.2" -mnv2_flops_str = fmt(mnv2_flops_mflops_value, precision=0, commas=False) # e.g. "150" -mnv2_fp32_str = fmt(mnv2_fp32_mb_value, precision=0, commas=False) # e.g. "9" -mnv2_int8_str = fmt(mnv2_int8_mb_value, precision=1, commas=False) # e.g. "2.2" -kws_example_params_str = fmt(kws_example_params_k_value, precision=0, commas=False) # e.g. "26" -kws_example_flops_str = fmt(kws_example_flops_mflops_value, precision=0, commas=False) # e.g. "6" -energy_mj_str = fmt(energy_per_inf_mj_value, precision=0, commas=False) # e.g. "15" -energy_j_str = fmt(energy_per_day_j_value, precision=1, commas=False) # e.g. "1.5" + # Energy + energy_per_inf_mj = inference_power_mw * inference_latency_ms / 1000 + energy_per_day_j = inferences_per_day * energy_per_inf_mj / 1000 + + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(mnv2_int8_mb < 512, "MobileNetV2 INT8 must fit in 512 MB edge RAM.") + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + mnv1_params_str = fmt(mnv1_params_m, precision=1, commas=False) # e.g. "4.2" + mnv1_flops_str = fmt(mnv1_flops_mflops, precision=0, commas=False) # e.g. "569" + mnv1_fp32_str = fmt(mnv1_fp32_mb, precision=0, commas=False) # e.g. "17" + mnv1_int8_str = fmt(mnv1_int8_mb, precision=0, commas=False) # e.g. "4" + mnv2_params_str = fmt(mnv2_params_m, precision=1, commas=False) # e.g. "2.2" + mnv2_flops_str = fmt(mnv2_flops_mflops, precision=0, commas=False) # e.g. "150" + mnv2_fp32_str = fmt(mnv2_fp32_mb, precision=0, commas=False) # e.g. "9" + mnv2_int8_str = fmt(mnv2_int8_mb, precision=1, commas=False) # e.g. 
"2.2" + kws_example_params_str = fmt(kws_example_params_k, precision=0, commas=False) # e.g. "26" + kws_example_flops_str = fmt(kws_example_flops_mflops, precision=0, commas=False) # e.g. "6" + energy_mj_str = fmt(energy_per_inf_mj, precision=0, commas=False) # e.g. "15" + energy_j_str = fmt(energy_per_day_j, precision=1, commas=False) # e.g. "1.5" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +mnv1_params_str = WildlifeModelSizing.mnv1_params_str +mnv1_flops_str = WildlifeModelSizing.mnv1_flops_str +mnv1_fp32_str = WildlifeModelSizing.mnv1_fp32_str +mnv1_int8_str = WildlifeModelSizing.mnv1_int8_str +mnv2_params_str = WildlifeModelSizing.mnv2_params_str +mnv2_flops_str = WildlifeModelSizing.mnv2_flops_str +mnv2_fp32_str = WildlifeModelSizing.mnv2_fp32_str +mnv2_int8_str = WildlifeModelSizing.mnv2_int8_str +kws_example_params_str = WildlifeModelSizing.kws_example_params_str +kws_example_flops_str = WildlifeModelSizing.kws_example_flops_str +energy_mj_str = WildlifeModelSizing.energy_mj_str +energy_j_str = WildlifeModelSizing.energy_j_str ``` With the throughput ceiling established, we can now apply the complete decision framework to a realistic scenario that exercises every step. @@ -4099,11 +4184,23 @@ Engineers add attention to CNNs or convolutions to Transformers expecting additi from mlsys.constants import A100_MEM_CAPACITY, GiB -# --- 8-GPU cluster memory --- -a100_8x_mem_value = int(A100_MEM_CAPACITY.to(GiB).magnitude) * 8 +class A100ClusterMemory: + """Contrast datacenter and edge memory: 8-GPU A100 node vs 4 GB edge device.""" -# --- Outputs (formatted strings for prose) --- -a100_8x_mem_str = f"{a100_8x_mem_value}" # e.g. "640" + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + n_gpus = 8 + + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + a100_8x_mem = int(A100_MEM_CAPACITY.m_as(GiB)) * n_gpus + + # ┌── 3. 
INVARIANTS (Guardrails) ────────────────────────────────────────── + check(a100_8x_mem > 400, "8x A100 cluster should provide >400 GiB memory.") + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + a100_8x_mem_str = f"{a100_8x_mem}" # e.g. "640" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +a100_8x_mem_str = A100ClusterMemory.a100_8x_mem_str ``` **Pitfall:** *Optimizing architectural decisions for training hardware without considering deployment constraints.* diff --git a/book/quarto/contents/vol1/nn_computation/nn_computation.qmd b/book/quarto/contents/vol1/nn_computation/nn_computation.qmd index 940b85aa5..d688b242a 100644 --- a/book/quarto/contents/vol1/nn_computation/nn_computation.qmd +++ b/book/quarto/contents/vol1/nn_computation/nn_computation.qmd @@ -60,24 +60,35 @@ Neural networks reduce to a small set of mathematical operations. Matrix multipl # ┌───────────────────────────────────────────────────────────────────────────── # │ DL PRIMER COMMON IMPORTS # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Shared imports and unit definitions for all compute cells in -# │ this chapter. +# │ Context: Chapter-scoped preamble cell supporting all compute cells from +# │ @sec-neural-computation-evolution-ml-paradigms-ec9c onward. +# │ Also seeds inf_madd_total_str used in the Purpose opening paragraph. # │ -# │ Goal: Centralize import statements for the chapter. -# │ Show: A clean namespace for subsequent calculations. -# │ How: Import all constants and formatting helpers from mlsys. +# │ Goal: Centralize unit imports and compute the MNIST forward-pass MAC count +# │ used in the Purpose section before any architecture cell runs. +# │ Show: A clean shared namespace and the 109,184 MAC count for 784→128→64→10. +# │ How: Wildcard-import mlsys.constants for units; count MACs as the sum of +# │ three layer products (784×128 + 128×64 + 64×10). 
# │ # │ Imports: mlsys.constants (*), mlsys.formatting (fmt, sci), # │ mlsys.formulas (model_memory) -# │ Exports: All mlsys.constants units and hardware specs +# │ Exports: inf_madd_total_str (Purpose prose), all mlsys.constants units # └───────────────────────────────────────────────────────────────────────────── + from mlsys.constants import * from mlsys.formatting import fmt, sci from mlsys.formulas import model_memory -# MNIST 784→128→64→10 MAC count (used in Purpose / From Logic to Arithmetic) -_inf_madd = 784 * 128 + 128 * 64 + 64 * 10 # 109,184 -inf_madd_total_str = f"{_inf_madd:,}" +# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── +class DlPrimerImports: + """Namespace for Dl Primer Imports.""" + + # MNIST 784→128→64→10 MAC count (used in Purpose / From Logic to Arithmetic) + _inf_madd = 784 * 128 + 128 * 64 + 64 * 10 # 109,184 + inf_madd_total_str = f"{_inf_madd:,}" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +inf_madd_total_str = DlPrimerImports.inf_madd_total_str ``` ## From Logic to Arithmetic {#sec-neural-computation-deep-learning-systems-engineering-foundation-597f} @@ -122,66 +133,92 @@ To ground this arc in a concrete systems story, we start by following a single M ```{python} #| echo: false #| label: paradigm-systems-cost + # ┌───────────────────────────────────────────────────────────────────────────── # │ PARADIGM SYSTEMS COST — RUNNING MNIST EXAMPLE # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: "Computing with Patterns" running example tracing the same -# │ 28×28 MNIST digit through rule-based, classical ML, and deep learning -# │ paradigms to make the systems cost escalation quantitative. +# │ Context: @sec-neural-computation-evolution-ml-paradigms-ec9c — running example +# │ comparing rule-based, classical ML (HOG + SVM), and deep learning on +# │ the same 28×28 MNIST digit to quantify systems cost escalation. 
# │ -# │ Goal: Trace the computational cost of pattern recognition across paradigms. -# │ Show: The order-of-magnitude cost escalation from rule-based to deep learning. -# │ How: Compare ops and memory for processing a single 28×28 digit. +# │ Goal: Establish concrete operation and memory counts for each paradigm so +# │ that prose can state the escalation factor precisely. +# │ Show: The ~1,500× MAC increase from rule-based (~100 ops) to deep learning +# │ (~109K MACs) and the proportional memory footprint growth. +# │ How: Hard-code pixel count, HOG cell grid, and MLP layer dimensions; compute +# │ ops and memory for each paradigm using arithmetic only. # │ -# │ Imports: mlsys.formatting (fmt) -# │ Exports: *_str variables for inline use in prose +# │ Imports: mlsys.formatting (fmt, check), +# │ mlsys.constants (KIB_TO_BYTES, MILLION, THOUSAND) +# │ Exports: rb_pixels_str, rb_ops_str, rb_mem_str, hog_features_str, +# │ hog_ops_approx_str, hog_mem_str, dl_total_macs_str, dl_params_str, +# │ dl_weight_kb_str, dl_ops_ratio_str # └───────────────────────────────────────────────────────────────────────────── + from mlsys.formatting import fmt, check from mlsys.constants import KIB_TO_BYTES, MILLION, THOUSAND -# === Rule-Based Paradigm === -# Simple threshold comparisons on 28×28 = 784 pixels -rb_pixels = 28 * 28 # 784 -rb_ops = 100 # ~100 comparisons (generous) -rb_mem_bytes = rb_pixels # 784 bytes working set -rb_pixels_str = fmt(rb_pixels, precision=0, commas=True) # "784" -rb_ops_str = fmt(rb_ops, precision=0, commas=False) # "100" -rb_mem_str = fmt(rb_mem_bytes, precision=0, commas=True) # "784" +# ┌── P.I.C.O. 
ISOLATED SCENARIO ─────────────────────────────────────────────── +class ParadigmSystemsCost: + """Namespace for Paradigm Systems Cost.""" -# === Classical ML (HOG + linear classifier) === -# HOG on 28×28: 7×7 grid of 4×4 cells, 9 orientation bins -hog_cells = 7 * 7 # 49 cells -hog_bins = 9 # orientation bins per cell -hog_features = hog_cells * hog_bins # 441 features -# Gradient computation: ~2 ops per pixel (dx, dy) + histogram binning -hog_gradient_ops = rb_pixels * 2 # 1,568 -hog_binning_ops = rb_pixels * 3 # ~2,352 (magnitude, angle, bin) -hog_classify_ops = hog_features * 10 # ~4,410 (SVM: 10 classes, dot products) -hog_total_ops = hog_gradient_ops + hog_binning_ops + hog_classify_ops # ~8,330 -hog_mem_kb = 2 # ~2 KB (image + gradients + histograms) -hog_grid_str = "7" # grid dimension -hog_bins_str = fmt(hog_bins, precision=0, commas=False) # "9" -hog_features_str = fmt(hog_features, precision=0, commas=True) # "441" -hog_ops_str = fmt(hog_total_ops, precision=0, commas=True) # "8,330" -hog_ops_approx_str = "8,000" # rounded for prose -hog_mem_str = fmt(hog_mem_kb, precision=0, commas=False) # "2" + # === Rule-Based Paradigm === + # Simple threshold comparisons on 28×28 = 784 pixels + rb_pixels = 28 * 28 # 784 + rb_ops = 100 # ~100 comparisons (generous) + rb_mem_bytes = rb_pixels # 784 bytes working set + rb_pixels_str = fmt(rb_pixels, precision=0, commas=True) # "784" + rb_ops_str = fmt(rb_ops, precision=0, commas=False) # "100" + rb_mem_str = fmt(rb_mem_bytes, precision=0, commas=True) # "784" -# === Deep Learning (784→128→64→10 MLP) === -# Forward pass MACs per layer -dl_l1_macs = 784 * 128 # 100,352 -dl_l2_macs = 128 * 64 # 8,192 -dl_l3_macs = 64 * 10 # 640 -dl_total_macs = dl_l1_macs + dl_l2_macs + dl_l3_macs # 109,184 -# Parameters (weights only, no bias for simplicity) -dl_params = dl_l1_macs + dl_l2_macs + dl_l3_macs # same as MACs for dense layers -dl_params_with_bias = dl_params + 128 + 64 + 10 # 109,386 -dl_weight_bytes = dl_params_with_bias * 4 
# FP32 = 4 bytes each -dl_weight_kb = dl_weight_bytes / KIB_TO_BYTES # ~427 KB -dl_total_macs_str = fmt(dl_total_macs, precision=0, commas=True) # "109,184" -dl_params_str = fmt(dl_params_with_bias, precision=0, commas=True) # "109,386" -dl_weight_kb_str = fmt(dl_weight_kb, precision=0, commas=False) # "427" -dl_ops_ratio = dl_total_macs / rb_ops # ~1,092× -dl_ops_ratio_str = fmt(dl_ops_ratio, precision=0, commas=True) # "1,092" + # === Classical ML (HOG + linear classifier) === + # HOG on 28×28: 7×7 grid of 4×4 cells, 9 orientation bins + hog_cells = 7 * 7 # 49 cells + hog_bins = 9 # orientation bins per cell + hog_features = hog_cells * hog_bins # 441 features + # Gradient computation: ~2 ops per pixel (dx, dy) + histogram binning + hog_gradient_ops = rb_pixels * 2 # 1,568 + hog_binning_ops = rb_pixels * 3 # ~2,352 (magnitude, angle, bin) + hog_classify_ops = hog_features * 10 # ~4,410 (SVM: 10 classes, dot products) + hog_total_ops = hog_gradient_ops + hog_binning_ops + hog_classify_ops # ~8,330 + hog_mem_kb = 2 # ~2 KB (image + gradients + histograms) + hog_grid_str = "7" # grid dimension + hog_bins_str = fmt(hog_bins, precision=0, commas=False) # "9" + hog_features_str = fmt(hog_features, precision=0, commas=True) # "441" + hog_ops_str = fmt(hog_total_ops, precision=0, commas=True) # "8,330" + hog_ops_approx_str = "8,000" # rounded for prose + hog_mem_str = fmt(hog_mem_kb, precision=0, commas=False) # "2" + + # === Deep Learning (784→128→64→10 MLP) === + # Forward pass MACs per layer + dl_l1_macs = 784 * 128 # 100,352 + dl_l2_macs = 128 * 64 # 8,192 + dl_l3_macs = 64 * 10 # 640 + dl_total_macs = dl_l1_macs + dl_l2_macs + dl_l3_macs # 109,184 + # Parameters (weights only, no bias for simplicity) + dl_params = dl_l1_macs + dl_l2_macs + dl_l3_macs # same as MACs for dense layers + dl_params_with_bias = dl_params + 128 + 64 + 10 # 109,386 + dl_weight_bytes = dl_params_with_bias * 4 # FP32 = 4 bytes each + dl_weight_kb = dl_weight_bytes / KIB_TO_BYTES # ~427 KB + 
dl_total_macs_str = fmt(dl_total_macs, precision=0, commas=True) # "109,184" + dl_params_str = fmt(dl_params_with_bias, precision=0, commas=True) # "109,386" + dl_weight_kb_str = fmt(dl_weight_kb, precision=0, commas=False) # "427" + dl_ops_ratio = dl_total_macs / rb_ops # ~1,092× + dl_ops_ratio_str = fmt(dl_ops_ratio, precision=0, commas=True) # "1,092" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +rb_pixels_str = ParadigmSystemsCost.rb_pixels_str +rb_ops_str = ParadigmSystemsCost.rb_ops_str +rb_mem_str = ParadigmSystemsCost.rb_mem_str +hog_grid_str = ParadigmSystemsCost.hog_grid_str +hog_bins_str = ParadigmSystemsCost.hog_bins_str +hog_features_str = ParadigmSystemsCost.hog_features_str +hog_ops_approx_str = ParadigmSystemsCost.hog_ops_approx_str +hog_mem_str = ParadigmSystemsCost.hog_mem_str +dl_total_macs_str = ParadigmSystemsCost.dl_total_macs_str +dl_params_str = ParadigmSystemsCost.dl_params_str +dl_weight_kb_str = ParadigmSystemsCost.dl_weight_kb_str +dl_ops_ratio_str = ParadigmSystemsCost.dl_ops_ratio_str ``` The shift from logic to arithmetic reshapes how we encode real-world patterns in a form a computer can process. To make this evolution concrete, we track a single task across all three paradigms: classifying a handwritten digit from a 28 $\times$ 28 pixel image from the MNIST[^fn-mnist-dataset] dataset (the same input used throughout this chapter). Watch how the computational profile changes as representation strategies evolve. @@ -376,14 +413,18 @@ This architecture exhibits predictable scaling\index{Scalability!deep learning}: # ┌───────────────────────────────────────────────────────────────────────────── # │ FIG-DOUBLE-DESCENT # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: @fig-double-descent illustrating the double descent phenomenon -# │ in model complexity vs test error. 
+# │ Context: @fig-double-descent in @sec-neural-computation-automatic-pattern-discovery-214d, +# │ illustrating why overparameterized deep learning defies classical +# │ statistical Bias-Variance theory. # │ -# │ Goal: Visualize the double descent phenomenon in overparameterized models. -# │ Show: That "bigger is better" holds past the interpolation threshold. -# │ How: Plot test error vs. model complexity showing the U-shape and second descent. +# │ Goal: Visualize the U-shape classical regime, the spike at the interpolation +# │ threshold, and the second descent into the modern overparameterized regime. +# │ Show: That "bigger is better" holds past the interpolation threshold — +# │ the empirical basis for 100B+ parameter frontier models. +# │ How: Generate a synthetic curve with a masked U-shape for the classical regime +# │ and an exponential decay for the modern regime; smooth with convolution. # │ -# │ Imports: numpy (np), mlsys.viz (viz) +# │ Imports: numpy (np), mlsys.viz (setup_plot, COLORS) # │ Exports: (figure only — no prose variables) # └───────────────────────────────────────────────────────────────────────────── import numpy as np @@ -572,15 +613,19 @@ While the preceding sections established the technical foundations of deep learn # ┌───────────────────────────────────────────────────────────────────────────── # │ FIG-TRENDS — COMPUTATIONAL GROWTH SCATTER PLOT # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: @fig-trends showing seven decades of AI training compute on a -# │ log scale, revealing the acceleration from 1.4× to 3.4-month doubling. +# │ Context: @fig-trends in @sec-neural-computation-evolution-neural-network-computing-8754, +# │ visually grounding the 4–5× annual compute growth cited in prose. # │ -# │ Goal: Visualize the acceleration of AI training compute requirements. -# │ Show: The shift from 2-year to 3.4-month doubling times after 2012. -# │ How: Plot training compute (FLOPs) vs. 
year on a logarithmic scale. +# │ Goal: Plot seven decades of AI training compute on a log scale, exposing +# │ the 1952–2010 linear trend (1.4×/year) and the 2012–2025 acceleration +# │ (doubling every ~3.4 months) driven by the deep learning era. +# │ Show: The inflection point at AlexNet (2012) where compute growth detached +# │ from Moore's Law, creating the hardware investment arms race. +# │ How: Load data/all_ai_models.csv; extract training compute per year; fit +# │ OLS log-linear trend lines for each era; annotate key models. # │ # │ Imports: pandas (pd), numpy (np), matplotlib.pyplot (plt), -# │ datetime (datetime), os, mlsys.viz (setup_plot, COLORS) +# │ datetime, os, mlsys.viz (setup_plot, COLORS) # │ Exports: (figure only — no prose variables) # └───────────────────────────────────────────────────────────────────────────── import pandas as pd @@ -778,13 +823,19 @@ else: # ┌───────────────────────────────────────────────────────────────────────────── # │ HISTORICAL MODEL PARAMETERS # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: @tbl-historical-performance showing 4 decades of NN evolution +# │ Context: @tbl-historical-performance in +# │ @sec-neural-computation-evolution-neural-network-computing-8754, +# │ providing the GPT-3 and GPT-4 cells that require live constants. # │ -# │ Goal: Provide concrete scale anchors for the evolution of neural networks. -# │ Show: The 8-order-of-magnitude leap from LeNet to GPT-4. -# │ How: Retrieve parameter counts for key historical models. +# │ Goal: Supply exact GPT-3 parameter count (from mlsys.constants) and an +# │ estimated GPT-4 MoE count for the 1989–2023 historical comparison table. +# │ Show: The 8-order-of-magnitude parameter leap from LeNet-1 (~9.8K) to +# │ GPT-4 (~1.8T), anchoring the co-scaling narrative. +# │ How: Read GPT3_PARAMS and convert to Bparam; treat GPT-4 as 1.8T (MoE +# │ estimate); compute scale factor as a guardrail invariant. 
# │ -# │ Imports: mlsys.constants (GPT3_PARAMS, Bparam), mlsys.formatting (fmt) +# │ Imports: mlsys.constants (GPT3_PARAMS, Bparam, THOUSAND), +# │ mlsys.formatting (fmt, check) # │ Exports: gpt3_params_b_str, gpt4_params_t_str # └───────────────────────────────────────────────────────────────────────────── from mlsys.constants import GPT3_PARAMS, Bparam, THOUSAND @@ -798,7 +849,7 @@ class HistoricalScale: """ # ┌── 1. PARAMETERS (Inputs) ─────────────────────────────────────────────── - gpt3_params_b = GPT3_PARAMS.to(Bparam).magnitude + gpt3_params_b = GPT3_PARAMS.m_as(Bparam) gpt4_params_t = 1.8 # Estimate (MoE) # ┌── 2. CALCULATION (The Physics) ───────────────────────────────────────── @@ -949,30 +1000,53 @@ To ground these concepts in a concrete example, we use handwritten digit recogni # ┌───────────────────────────────────────────────────────────────────────────── # │ MNIST ARCHITECTURE CONSTANTS # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Running Example callout and multiple subsequent references +# │ Context: "Running Example: MNIST Digit Recognition" callout at the start of +# │ @sec-neural-computation-network-architecture-fundamentals-1f58; +# │ topology constants reused by mnist-memory-calc, mnist-flops-calc, +# │ mnist-training-memory-calc, and mnist-weights-calc cells below. # │ -# │ Goal: Define the canonical MNIST architecture for the running example. -# │ Show: The 784→128→64→10 topology used throughout the chapter. -# │ How: Set constants for input, hidden, and output layer dimensions. +# │ Goal: Define the canonical 784→128→64→10 MLP topology once, serving as the +# │ single source of truth for all subsequent MNIST running-example cells. +# │ Show: The four layer dimensions (input, two hidden, output) and the +# │ human-readable architecture string used in prose and callouts. 
+# │ How: Derive input dimension from MNIST_IMAGE_WIDTH × MNIST_IMAGE_HEIGHT; +# │ assign hidden and output layer widths as named constants. # │ # │ Imports: mlsys.constants (MNIST_IMAGE_WIDTH, MNIST_IMAGE_HEIGHT) -# │ Exports: mnist_l1_dim, mnist_l2_dim, mnist_l3_dim, mnist_l4_dim, -# │ mnist_arch_str, mnist_input_str +# │ Exports: mnist_l1_dim (784), mnist_l2_dim (128), mnist_l3_dim (64), +# │ mnist_l4_dim (10), mnist_arch_str ("784→128→64→10"), mnist_input_str +# │ +# │ Note: mnist_l*_dim and mnist_arch_str are consumed by subsequent cells +# │ up to mnist-weights-calc (~2700 lines below this cell). These are +# │ intentionally chapter-scoped topology constants. # └───────────────────────────────────────────────────────────────────────────── + from mlsys.constants import MNIST_IMAGE_WIDTH, MNIST_IMAGE_HEIGHT -# --- Inputs (canonical MNIST architecture) --- -mnist_l1_dim = 784 # input: 28×28 pixels -mnist_l2_dim = 128 # hidden layer 1 -mnist_l3_dim = 64 # hidden layer 2 -mnist_l4_dim = 10 # output: 10 digit classes +# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── +class MnistArchitectureConstants: + """Namespace for Mnist Architecture Constants.""" -# --- Derived values --- -mnist_input_neurons_value = MNIST_IMAGE_WIDTH * MNIST_IMAGE_HEIGHT # 28×28 = 784 + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + mnist_l1_dim = 784 # input: 28×28 pixels + mnist_l2_dim = 128 # hidden layer 1 + mnist_l3_dim = 64 # hidden layer 2 + mnist_l4_dim = 10 # output: 10 digit classes -# --- Outputs (formatted strings for prose) --- -mnist_arch_str = f"{mnist_l1_dim}→{mnist_l2_dim}→{mnist_l3_dim}→{mnist_l4_dim}" # e.g. "784→128→64→10" -mnist_input_str = f"{mnist_input_neurons_value}" # e.g. "784" + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + mnist_input_neurons_value = MNIST_IMAGE_WIDTH * MNIST_IMAGE_HEIGHT # 28×28 = 784 + + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + mnist_arch_str = f"{mnist_l1_dim}→{mnist_l2_dim}→{mnist_l3_dim}→{mnist_l4_dim}" # e.g. "784→128→64→10" + mnist_input_str = f"{mnist_input_neurons_value}" # e.g. "784" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +mnist_l1_dim = MnistArchitectureConstants.mnist_l1_dim +mnist_l2_dim = MnistArchitectureConstants.mnist_l2_dim +mnist_l3_dim = MnistArchitectureConstants.mnist_l3_dim +mnist_l4_dim = MnistArchitectureConstants.mnist_l4_dim +mnist_arch_str = MnistArchitectureConstants.mnist_arch_str +mnist_input_str = MnistArchitectureConstants.mnist_input_str ``` ::: {.callout-example title="Running Example: MNIST Digit Recognition"} @@ -1608,45 +1682,73 @@ This simple network demonstrates how hidden layers enable learning non-linear pa # ┌───────────────────────────────────────────────────────────────────────────── # │ MNIST NETWORK SCALE COMPARISON # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Footnote [^fn-computational-scale] comparing large vs small nets +# │ Context: Footnote [^fn-computational-scale] in +# │ @sec-neural-computation-feedforward-network-architecture-d2cf, +# │ quantifying the memory-accuracy trade-off for two MNIST MLP variants. # │ -# │ Goal: Illustrate the diminishing returns of model scaling. -# │ Show: That a 20× resource increase yields only marginal accuracy gains on MNIST. -# │ How: Compare parameter counts for large vs. small MLP variants. +# │ Goal: Show that a 20× parameter increase (784→1000→1000→10 vs 784→100→100→10) +# │ yields only a marginal accuracy improvement (99.5% vs 98.5%) on MNIST. +# │ Show: ~1.8M parameters requiring ~7 MB vs ~89K parameters requiring ~347 KB, +# │ motivating architecture design discipline for mobile deployment. +# │ How: Sum weights (input×output) + biases (output) per layer for both +# │ architectures; convert to MB/KB using model_memory(). 
# │ # │ Imports: mlsys.constants (BYTES_FP32, MB, KiB, param, Mparam, Kparam), # │ mlsys.formulas (model_memory) -# │ Exports: mnist_large_*, mnist_small_* +# │ Exports: mnist_large_params_m_str, mnist_large_mem_mb_str, +# │ mnist_small_params_k_str, mnist_small_mem_kb_str # └───────────────────────────────────────────────────────────────────────────── + from mlsys.constants import BYTES_FP32, MB, KiB, param, Mparam, Kparam from mlsys.formulas import model_memory -# --- Inputs (large architecture: 784→1000→1000→10) --- -mnist_large_l1 = 1000 # hidden layer 1 width -mnist_large_l2 = 1000 # hidden layer 2 width -mnist_large_arch = [(mnist_l1_dim, mnist_large_l1), - (mnist_large_l1, mnist_large_l2), - (mnist_large_l2, mnist_l4_dim)] +# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── +class MnistScaleComparison: + """Namespace for Mnist Scale Comparison.""" -# --- Inputs (small architecture: 784→100→100→10) --- -mnist_small_l1 = 100 # hidden layer 1 width -mnist_small_l2 = 100 # hidden layer 2 width -mnist_small_arch = [(mnist_l1_dim, mnist_small_l1), - (mnist_small_l1, mnist_small_l2), - (mnist_small_l2, mnist_l4_dim)] + # ┌── 1. 
PARAMETERS (Inputs) ────────────────────────────────────────────── + mnist_large_l1 = 1000 # hidden layer 1 width + mnist_large_l2 = 1000 # hidden layer 2 width + mnist_large_arch = [(mnist_l1_dim, mnist_large_l1), + (mnist_large_l1, mnist_large_l2), + (mnist_large_l2, mnist_l4_dim)] -# --- Process (parameter and memory calculations) --- -mnist_large_params = sum(i * o + o for i, o in mnist_large_arch) -mnist_large_mem_mb = model_memory(mnist_large_params, BYTES_FP32, MB) + # Small architecture inputs + mnist_small_l1 = 100 # hidden layer 1 width + mnist_small_l2 = 100 # hidden layer 2 width + mnist_small_arch = [(mnist_l1_dim, mnist_small_l1), + (mnist_small_l1, mnist_small_l2), + (mnist_small_l2, mnist_l4_dim)] -mnist_small_params = sum(i * o + o for i, o in mnist_small_arch) -mnist_small_mem_kb = model_memory(mnist_small_params, BYTES_FP32, KiB) + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + # Use explicit for-loops: generator expressions skip the class namespace in Python 3 + _large_params = 0 + for _i, _o in mnist_large_arch: + _large_params += _i * _o + _o + mnist_large_params = _large_params + mnist_large_mem_mb = model_memory(mnist_large_params, BYTES_FP32, MB) -# --- Outputs (formatted strings for prose) --- -mnist_large_params_m_str = f"{(mnist_large_params * param).to(Mparam).magnitude:.1f}" # e.g. "1.8" -mnist_large_mem_mb_str = f"{mnist_large_mem_mb:.0f}" # e.g. "7" -mnist_small_params_k_str = f"{(mnist_small_params * param).to(Kparam).magnitude:.0f}" # e.g. "89" -mnist_small_mem_kb_str = f"{mnist_small_mem_kb:.0f}" # e.g. "347" + _small_params = 0 + for _i, _o in mnist_small_arch: + _small_params += _i * _o + _o + mnist_small_params = _small_params + mnist_small_mem_kb = model_memory(mnist_small_params, BYTES_FP32, KiB) + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + mnist_large_params_m_str = f"{(mnist_large_params * param).m_as(Mparam):.1f}" # e.g. 
"1.8" + mnist_large_mem_mb_str = f"{mnist_large_mem_mb:.0f}" # e.g. "7" + mnist_small_params_k_str = f"{(mnist_small_params * param).m_as(Kparam):.0f}" # e.g. "89" + mnist_small_mem_kb_str = f"{mnist_small_mem_kb:.0f}" # e.g. "347" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +mnist_large_l1 = MnistScaleComparison.mnist_large_l1 +mnist_large_l2 = MnistScaleComparison.mnist_large_l2 +mnist_small_l1 = MnistScaleComparison.mnist_small_l1 +mnist_small_l2 = MnistScaleComparison.mnist_small_l2 +mnist_large_params_m_str = MnistScaleComparison.mnist_large_params_m_str +mnist_large_mem_mb_str = MnistScaleComparison.mnist_large_mem_mb_str +mnist_small_params_k_str = MnistScaleComparison.mnist_small_params_k_str +mnist_small_mem_kb_str = MnistScaleComparison.mnist_small_mem_kb_str ``` The XOR example established the canonical three-layer architecture, but real-world networks require systematic consideration of design constraints and computational scale[^fn-computational-scale]. Recognizing handwritten digits using the MNIST\index{MNIST Dataset!digit recognition}\index{Dataset!MNIST benchmark} [@lecun1998gradient] dataset illustrates how problem structure determines network dimensions while hidden layer configuration remains an important design decision. @@ -1872,21 +1974,33 @@ These connection patterns have significant implications for both the theoretical # ┌───────────────────────────────────────────────────────────────────────────── # │ MNIST MEMORY CALC — TRAINING VS INFERENCE FOOTPRINT # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: MNIST memory callout, per-layer parameter/activation tables, -# │ and forward-pass FLOP counts referenced throughout the chapter. +# │ Context: Primary computation hub for the running example, feeding: +# │ - Per-layer parameter and activation tables in +# │ @sec-neural-computation-model-size-computational-complexity-1f0f +# │ - "Memory: Training vs. 
Inference" callout (preceding mnist-training-memory-calc) +# │ - Forward-pass FLOP analysis in @sec-neural-computation-forward-pass-computation-e5dd +# │ - Backpropagation memory analysis in +# │ @sec-neural-computation-gradient-computation-backpropagation-dacf # │ -# │ Goal: Establish the quantitative backbone for the running example. -# │ Show: Exact parameter, memory, and FLOP counts for the 784→128→64→10 MLP. -# │ How: Calculate per-layer stats for weights, biases, and activations. +# │ Goal: Establish the complete quantitative backbone — parameter counts, memory +# │ footprint, activation sizes, FLOP totals, and arithmetic intensity — +# │ for the 784→128→64→10 MLP at batch=32. +# │ Show: Exact per-layer stats (weights, biases, params, activations) and the +# │ training_ratio showing training requires 4× inference memory. +# │ How: Iterate over layer dimension pairs; compute weights = din × dout, +# │ biases = dout; accumulate activation memory for batch=32; derive FLOPs +# │ as 2×MACs×batch; compute arithmetic intensity as FLOPs/ModelBytes. 
# │ -# │ Imports: mlsys.formatting (fmt), -# │ mlsys.constants (BYTES_FP32, flop, MFLOPs, KFLOPs) -# │ Exports: param_mem_str, grad_mem_str, opt_mem_str, total_act_str, -# │ training_mb_str, inference_kb_str, training_ratio_str, -# │ w1_str..w3_str, b1_str..b3_str, t1_str..t3_str, -# │ total_params_str, total_mops_str, per_image_kops_str, -# │ layer1_pct_str, arith_intensity_str, batch_*_str, grad_l*_str, -# │ bp_*_str, inf_madd_*_str +# │ Imports: mlsys.formatting (fmt, check), +# │ mlsys.constants (BYTES_FP32, flop, MFLOPs, KFLOPs, MILLION, +# │ THOUSAND, KIB_TO_BYTES) +# │ Exports: MNISTMemory class (all attributes), BackpropMemory class (all +# │ attributes), plus top-level strings: w1_str, w2_str, w3_str, +# │ b1_str, b2_str, b3_str, p1_str, p2_str, p3_str, total_params_str, +# │ training_ratio_str, inf_madd_l1_str, inf_madd_l2_str, inf_madd_l3_str +# │ +# │ Note: MNISTMemory and BackpropMemory attributes are consumed by +# │ mnist-training-memory-calc and mnist-flops-calc cells (~300 lines below). # └───────────────────────────────────────────────────────────────────────────── from mlsys.formatting import fmt, check from mlsys.constants import BYTES_FP32, flop, MFLOPs, KFLOPs, MILLION, THOUSAND, KIB_TO_BYTES @@ -2112,46 +2226,82 @@ Parameter count grows with network width and depth. For our MNIST example, consi # ┌───────────────────────────────────────────────────────────────────────────── # │ MNIST TRAINING VS INFERENCE MEMORY — DETAILED BREAKDOWN # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Callout "Memory: Training vs. Inference" +# │ Context: "Memory: Training vs. Inference" worked example callout in +# │ @sec-neural-computation-model-size-computational-complexity-1f0f # │ -# │ Goal: Contrast training and inference memory usage. -# │ Show: That training requires >2× more memory due to stored activations. -# │ How: Sum weights, gradients, optimizer state, and activations for batch 32. 
+# │ Goal: Compute per-layer and total memory for the 784→128→64→10 MLP at +# │ batch=32, then contrast training vs. inference footprint. +# │ Show: That training requires ~4× more memory than inference due to gradients +# │ and Adam optimizer state, raising total from ~KB to ~MB. +# │ How: Re-export pre-computed values from MNISTMemory and BackpropMemory +# │ (defined in the preceding mnist-memory-calc cell) with descriptive names +# │ for the per-layer table in the callout. # │ # │ Imports: mlsys.formatting (fmt, check) -# │ Exports: p1_str, p2_str, p3_str, act_*_str, grad_kib_str, etc. +# │ Exports: param_kib_str, grad_kib_str, opt_kib_str, act_in_count_str, +# │ act_in_kib_str, act_h1_count_str, act_h1_kib_str, act_h2_count_str, +# │ act_h2_kib_str, act_out_count_str, act_out_kib_str, +# │ total_act_count_str, total_act_kib_str, +# │ total_train_mib_str, total_infer_kib_str, +# │ bp_input_kb_str, bp_h1_kb_str, bp_h2_kb_str, bp_out_kb_str # └───────────────────────────────────────────────────────────────────────────── + from mlsys.formatting import fmt, check -# Use MNISTMemory and BackpropMemory as Single Source of Truth -# (Variables already exported in previous cell) +# ┌── P.I.C.O. 
ISOLATED SCENARIO ─────────────────────────────────────────────── +class MnistTrainingMemoryCalc: + """Namespace for Mnist Training Memory Calc.""" -# Re-exporting specifically for the prose below this cell for clarity -# Canonical MNIST (MNISTMemory) -param_kib_str = fmt(MNISTMemory.param_mem_kb, precision=1, commas=False) -grad_kib_str = fmt(MNISTMemory.grad_mem_kb, precision=1, commas=False) -opt_kib_str = fmt(MNISTMemory.opt_mem_kb, precision=1, commas=False) + # Use MNISTMemory and BackpropMemory as Single Source of Truth + # (Variables already exported in previous cell) -act_in_count_str = f"{MNISTMemory.batch_act_sizes[0]:,}" -act_in_kib_str = fmt((MNISTMemory.batch_act_sizes[0] * 4) / 1024, precision=1, commas=False) -act_h1_count_str = f"{MNISTMemory.batch_act_sizes[1]:,}" -act_h1_kib_str = fmt((MNISTMemory.batch_act_sizes[1] * 4) / 1024, precision=1, commas=False) -act_h2_count_str = f"{MNISTMemory.batch_act_sizes[2]:,}" -act_h2_kib_str = fmt((MNISTMemory.batch_act_sizes[2] * 4) / 1024, precision=1, commas=False) -act_out_count_str = f"{MNISTMemory.batch_act_sizes[3]:,}" -act_out_kib_str = fmt((MNISTMemory.batch_act_sizes[3] * 4) / 1024, precision=1, commas=False) + # Re-exporting specifically for the prose below this cell for clarity + # Canonical MNIST (MNISTMemory) + param_kib_str = fmt(MNISTMemory.param_mem_kb, precision=1, commas=False) + grad_kib_str = fmt(MNISTMemory.grad_mem_kb, precision=1, commas=False) + opt_kib_str = fmt(MNISTMemory.opt_mem_kb, precision=1, commas=False) -total_act_count_str = f"{sum(MNISTMemory.batch_act_sizes):,}" -total_act_kib_str = fmt(MNISTMemory.act_mem_kb, precision=1, commas=False) + act_in_count_str = f"{MNISTMemory.batch_act_sizes[0]:,}" + act_in_kib_str = fmt((MNISTMemory.batch_act_sizes[0] * 4) / 1024, precision=1, commas=False) + act_h1_count_str = f"{MNISTMemory.batch_act_sizes[1]:,}" + act_h1_kib_str = fmt((MNISTMemory.batch_act_sizes[1] * 4) / 1024, precision=1, commas=False) + act_h2_count_str = 
f"{MNISTMemory.batch_act_sizes[2]:,}" + act_h2_kib_str = fmt((MNISTMemory.batch_act_sizes[2] * 4) / 1024, precision=1, commas=False) + act_out_count_str = f"{MNISTMemory.batch_act_sizes[3]:,}" + act_out_kib_str = fmt((MNISTMemory.batch_act_sizes[3] * 4) / 1024, precision=1, commas=False) -total_train_mib_str = fmt(MNISTMemory.training_total_kb / 1024, precision=1, commas=False) -total_infer_kib_str = fmt(MNISTMemory.inference_total_kb, precision=0, commas=False) + total_act_count_str = f"{sum(MNISTMemory.batch_act_sizes):,}" + total_act_kib_str = fmt(MNISTMemory.act_mem_kb, precision=1, commas=False) -# Backprop Wider Network (BackpropMemory) -bp_input_kb_str = BackpropMemory.bp_input_kb_str -bp_h1_kb_str = BackpropMemory.act_l1_kb_str -bp_h2_kb_str = BackpropMemory.act_l2_kb_str -bp_out_kb_str = fmt((BackpropMemory.act_counts[3] * 4) / 1024, precision=1, commas=False) + total_train_mib_str = fmt(MNISTMemory.training_total_kb / 1024, precision=1, commas=False) + total_infer_kib_str = fmt(MNISTMemory.inference_total_kb, precision=0, commas=False) + + # Backprop Wider Network (BackpropMemory) + bp_input_kb_str = BackpropMemory.bp_input_kb_str + bp_h1_kb_str = BackpropMemory.act_l1_kb_str + bp_h2_kb_str = BackpropMemory.act_l2_kb_str + bp_out_kb_str = fmt((BackpropMemory.act_counts[3] * 4) / 1024, precision=1, commas=False) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +param_kib_str = MnistTrainingMemoryCalc.param_kib_str +grad_kib_str = MnistTrainingMemoryCalc.grad_kib_str +opt_kib_str = MnistTrainingMemoryCalc.opt_kib_str +act_in_count_str = MnistTrainingMemoryCalc.act_in_count_str +act_in_kib_str = MnistTrainingMemoryCalc.act_in_kib_str +act_h1_count_str = MnistTrainingMemoryCalc.act_h1_count_str +act_h1_kib_str = MnistTrainingMemoryCalc.act_h1_kib_str +act_h2_count_str = MnistTrainingMemoryCalc.act_h2_count_str +act_h2_kib_str = MnistTrainingMemoryCalc.act_h2_kib_str +act_out_count_str = 
MnistTrainingMemoryCalc.act_out_count_str +act_out_kib_str = MnistTrainingMemoryCalc.act_out_kib_str +total_act_count_str = MnistTrainingMemoryCalc.total_act_count_str +total_act_kib_str = MnistTrainingMemoryCalc.total_act_kib_str +total_train_mib_str = MnistTrainingMemoryCalc.total_train_mib_str +total_infer_kib_str = MnistTrainingMemoryCalc.total_infer_kib_str +bp_input_kb_str = MnistTrainingMemoryCalc.bp_input_kb_str +bp_h1_kb_str = MnistTrainingMemoryCalc.bp_h1_kb_str +bp_h2_kb_str = MnistTrainingMemoryCalc.bp_h2_kb_str +bp_out_kb_str = MnistTrainingMemoryCalc.bp_out_kb_str ``` ::: {.callout-example title="Memory: Training vs. Inference"} @@ -2209,47 +2359,67 @@ The memory requirements above seem modest for our small MNIST classifier. But wh # ┌───────────────────────────────────────────────────────────────────────────── # │ MEMORY EXPLOSION COMPARISON # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: "The Memory Explosion" callout comparing MNIST to GPT-2 +# │ Context: "The Memory Explosion" callout in +# │ @sec-neural-computation-model-size-computational-complexity-1f0f, +# │ quantifying the scale gap that motivates production GPU requirements. # │ -# │ Goal: Illustrate the scale gap between toy and production models. -# │ Show: The 1.5 million-fold jump from MNIST to GPT-2 parameter scale. -# │ How: Compare parameter counts and memory footprints for both architectures. +# │ Goal: Contrast the toy MNIST MLP (~107K params, ~420 KB) with GPT-2 +# │ (124M params, ~475 MB) to make the 1,160× parameter scale-up visceral. +# │ Show: That crossing from toy to production scale requires GPU memory rather +# │ than CPU RAM, justifying the accelerator infrastructure introduced in +# │ @sec-hardware-acceleration. +# │ How: Re-derive MNIST params from the canonical architecture; read GPT2_PARAMS +# │ from mlsys.constants; compute memory with model_memory(); express ratio. 
# │ # │ Imports: mlsys.constants (GPT2_PARAMS, BYTES_FP32, param, Kparam, Bparam, -# │ KiB, GB), mlsys.formulas (model_memory), mlsys.formatting (fmt) +# │ KiB, GB), mlsys.formulas (model_memory), mlsys.formatting (fmt, check) # │ Exports: mnist_params_count_str, mnist_mem_str, mnist_params_k_str, # │ gpt2_params_count_str, gpt2_params_b_str, gpt2_mem_str, mem_jump_str # └───────────────────────────────────────────────────────────────────────────── + from mlsys.constants import GPT2_PARAMS, BYTES_FP32, param, Kparam, Bparam, KiB, GB from mlsys.formulas import model_memory from mlsys.formatting import fmt, check -# --- Re-derived inputs (MNIST canonical architecture: 784→128→64→10) --- -mnist_l1_dim = 784 # input: 28×28 pixels -mnist_l2_dim = 128 # hidden layer 1 -mnist_l3_dim = 64 # hidden layer 2 -mnist_l4_dim = 10 # output: 10 digit classes -mnist_arch_value = [(mnist_l1_dim, mnist_l2_dim), - (mnist_l2_dim, mnist_l3_dim), - (mnist_l3_dim, mnist_l4_dim)] -mnist_params_value = sum(i * o + o for i, o in mnist_arch_value) # weights + biases +# ┌── P.I.C.O. 
ISOLATED SCENARIO ─────────────────────────────────────────────── +class MemoryExplosionCalc: + """Namespace for Memory Explosion Calc.""" -# --- Process (memory calculations) --- -mnist_mem_kb_value = model_memory(mnist_params_value, BYTES_FP32, KiB) # Uses 1024 base -gpt2_params_count_value = GPT2_PARAMS.to(param).magnitude -gpt2_params_b_value = GPT2_PARAMS.to(Bparam).magnitude -gpt2_mem_gb_value = model_memory(GPT2_PARAMS, BYTES_FP32, GB) -mem_jump_value = gpt2_params_count_value / mnist_params_value + # --- Re-derived inputs (MNIST canonical architecture: 784→128→64→10) --- + mnist_l1_dim = 784 # input: 28×28 pixels + mnist_l2_dim = 128 # hidden layer 1 + mnist_l3_dim = 64 # hidden layer 2 + mnist_l4_dim = 10 # output: 10 digit classes + mnist_arch_value = [(mnist_l1_dim, mnist_l2_dim), + (mnist_l2_dim, mnist_l3_dim), + (mnist_l3_dim, mnist_l4_dim)] + mnist_params_value = sum(i * o + o for i, o in mnist_arch_value) # weights + biases -# --- Outputs (formatted strings for prose) --- -mnist_params_count_str = f"{mnist_params_value:,}" # e.g. "109,386" -mnist_params_k_str = fmt((mnist_params_value * param).to(Kparam).magnitude, - precision=0, commas=False) # e.g. "109" -mnist_mem_str = fmt(mnist_mem_kb_value, precision=0, commas=False) # e.g. "427" -gpt2_params_count_str = fmt(gpt2_params_count_value, precision=0, commas=True) # e.g. "1,558,000,000" -gpt2_params_b_str = fmt(gpt2_params_b_value, precision=1, commas=False) # e.g. "1.6" -gpt2_mem_str = fmt(gpt2_mem_gb_value, precision=0, commas=False) # e.g. "6" -mem_jump_str = fmt(mem_jump_value, precision=0, commas=True) # e.g. "14,244" + # ┌── 2. 
CALCULATION (The Physics) ──────────────────────────────────────── + mnist_mem_kb_value = model_memory(mnist_params_value, BYTES_FP32, KiB) # Uses 1024 base + gpt2_params_count_value = GPT2_PARAMS.m_as(param) + gpt2_params_b_value = GPT2_PARAMS.m_as(Bparam) + gpt2_mem_gb_value = model_memory(GPT2_PARAMS, BYTES_FP32, GB) + mem_jump_value = gpt2_params_count_value / mnist_params_value + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + mnist_params_count_str = f"{mnist_params_value:,}" # e.g. "109,386" + mnist_params_k_str = fmt((mnist_params_value * param).m_as(Kparam), + precision=0, commas=False) # e.g. "109" + mnist_mem_str = fmt(mnist_mem_kb_value, precision=0, commas=False) # e.g. "427" + gpt2_params_count_str = fmt(gpt2_params_count_value, precision=0, commas=True) # e.g. "1,558,000,000" + gpt2_params_b_str = fmt(gpt2_params_b_value, precision=1, commas=False) # e.g. "1.6" + gpt2_mem_str = fmt(gpt2_mem_gb_value, precision=0, commas=False) # e.g. "6" + mem_jump_str = fmt(mem_jump_value, precision=0, commas=True) # e.g. "14,244" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +mnist_params_count_str = MemoryExplosionCalc.mnist_params_count_str +mnist_mem_str = MemoryExplosionCalc.mnist_mem_str +mnist_params_k_str = MemoryExplosionCalc.mnist_params_k_str +gpt2_params_count_str = MemoryExplosionCalc.gpt2_params_count_str +gpt2_params_b_str = MemoryExplosionCalc.gpt2_params_b_str +gpt2_mem_str = MemoryExplosionCalc.gpt2_mem_str +mem_jump_str = MemoryExplosionCalc.mem_jump_str ``` ::: {.callout-notebook title="The Memory Explosion"} @@ -2266,38 +2436,55 @@ The memory calculations above are precise but slow. 
Experienced engineers develo ```{python} #| label: mental-math-calc #| echo: false + # ┌───────────────────────────────────────────────────────────────────────────── # │ MENTAL MATH CALC — QUICK GPU FEASIBILITY CHECK # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Callout "Quick Estimation for ML Engineers" +# │ Context: "Quick Estimation for ML Engineers" callout in +# │ @sec-neural-computation-model-size-computational-complexity-1f0f # │ -# │ Goal: Teach the napkin math for estimating training memory requirements. -# │ Show: That parameters alone undercount memory usage by ~4×. -# │ How: Calculate total footprint (weights + grads + optimizer) for a 100M param model. +# │ Goal: Demonstrate the 4× training memory multiplier (weights + gradients +# │ + Adam momentum + Adam velocity) for a 100M-parameter FP32 model on +# │ a 16 GB GPU, leaving room for activations. +# │ Show: That 100M × 4 bytes × 4 = 1.6 GB — the model fits comfortably, +# │ teaching engineers to estimate feasibility before coding. +# │ How: Multiply params × bytes_per_param × overhead_factor; subtract from GPU +# │ capacity; express residual as available activation headroom. # │ -# │ Imports: mlsys.formatting (fmt), mlsys.constants (byte, GB) +# │ Imports: mlsys.formatting (fmt, check), mlsys.constants (byte, GB, MILLION) # │ Exports: mm_model_str, mm_remaining_str, mm_params_m_str, mm_gpu_gb_str # └───────────────────────────────────────────────────────────────────────────── + from mlsys.formatting import fmt, check from mlsys.constants import byte, GB -# --- Inputs (hypothetical 100M-param model on a 16 GB GPU) --- -mm_params_m_value = 100 -mm_bytes_value = 4 -mm_overhead_value = 4 # params + grads + optimizer states -mm_gpu_gb_value = 16 +# ┌── P.I.C.O. 
ISOLATED SCENARIO ─────────────────────────────────────────────── +class MentalMathCalc: + """Namespace for Mental Math Calc.""" -# --- Process (total training memory footprint) --- -mm_model_gb_value = ( - mm_params_m_value * MILLION * mm_bytes_value * mm_overhead_value * byte -).to(GB).magnitude -mm_remaining_gb_value = mm_gpu_gb_value - mm_model_gb_value + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + mm_params_m_value = 100 + mm_bytes_value = 4 + mm_overhead_value = 4 # params + grads + optimizer states + mm_gpu_gb_value = 16 -# --- Outputs (formatted strings for prose) --- -mm_model_str = fmt(mm_model_gb_value, precision=1, commas=False) -mm_remaining_str = fmt(mm_remaining_gb_value, precision=0, commas=False) -mm_params_m_str = str(mm_params_m_value) -mm_gpu_gb_str = str(mm_gpu_gb_value) + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + mm_model_gb_value = ( + mm_params_m_value * MILLION * mm_bytes_value * mm_overhead_value * byte + ).m_as(GB) + mm_remaining_gb_value = mm_gpu_gb_value - mm_model_gb_value + + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + mm_model_str = fmt(mm_model_gb_value, precision=1, commas=False) + mm_remaining_str = fmt(mm_remaining_gb_value, precision=0, commas=False) + mm_params_m_str = str(mm_params_m_value) + mm_gpu_gb_str = str(mm_gpu_gb_value) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +mm_model_str = MentalMathCalc.mm_model_str +mm_remaining_str = MentalMathCalc.mm_remaining_str +mm_params_m_str = MentalMathCalc.mm_params_m_str +mm_gpu_gb_str = MentalMathCalc.mm_gpu_gb_str ``` ::: {.callout-notebook title="Quick Estimation for ML Engineers"} @@ -2570,79 +2757,102 @@ For each image in the batch, this produces a probability distribution over the p ```{python} #| label: mnist-flops-calc #| echo: false + # ┌───────────────────────────────────────────────────────────────────────────── # │ MNIST FLOPS CALC — FORWARD-PASS OPERATION COUNT # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Callout "Counting Ops in Forward Pass" +# │ Context: "Counting Ops in Forward Pass" worked example callout in +# │ @sec-neural-computation-matrix-multiplication-formulation-417c # │ -# │ Goal: Build intuition for the arithmetic cost of neural networks. -# │ Show: The $O(\text{width}^2)$ scaling of dense layer computation. -# │ How: Calculate FLOPs for the canonical MLP and its 2× width variant. +# │ Goal: Count FLOPs per layer for the canonical 784→128→64→10 MLP at batch=32, +# │ then show that doubling hidden widths (→256→128) causes ~4× FLOP growth. +# │ Show: That dense FC layers scale as O(width²), so a 2× width increase costs +# │ ~4×, making architecture decisions a dominant systems variable. +# │ How: Apply the formula 2×Batch×M×N for each matmul layer plus 2×Batch×N for +# │ bias+activation; compare original vs. doubled-width totals. 
# │ -# │ Imports: mlsys.formatting (fmt), mlsys.constants (flop, MFLOPs) -# │ Exports: l1_mm_str..l3_bias_str, total_mops_str, -# │ double_total_mops_str, double_ratio_str, double_ratio_exact_str +# │ Imports: mlsys.formatting (fmt, check), mlsys.constants (flop, MFLOPs) +# │ Exports: l1_mm_str, l1_bias_str, l2_mm_str, l2_bias_str, l3_mm_str, +# │ l3_bias_str, total_mops_str, double_total_mops_str, +# │ double_ratio_str, double_ratio_exact_str # └───────────────────────────────────────────────────────────────────────────── + from mlsys.formatting import fmt, check from mlsys.constants import flop, MFLOPs -# --- Inputs (MNIST architecture and doubled variant) --- -batch_size_value = 32 -in_dim_value = 784 -h1_value = 128 -h2_value = 64 -out_dim_value = 10 -double_h1_value = 256 -double_h2_value = 128 +# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── +class MnistFlopsCalc: + """Namespace for Mnist Flops Calc.""" -# --- Process (per-layer FLOP counts: 2×M×K×N for matmul) --- -flops_l1_mm_value = 2 * batch_size_value * in_dim_value * h1_value -flops_l1_bias_value = 2 * (batch_size_value * h1_value) # Bias add + ReLU + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + batch_size_value = 32 + in_dim_value = 784 + h1_value = 128 + h2_value = 64 + out_dim_value = 10 + double_h1_value = 256 + double_h2_value = 128 -flops_l2_mm_value = 2 * batch_size_value * h1_value * h2_value -flops_l2_bias_value = 2 * (batch_size_value * h2_value) + # ┌── 2. 
CALCULATION (The Physics) ──────────────────────────────────────── + flops_l1_mm_value = 2 * batch_size_value * in_dim_value * h1_value + flops_l1_bias_value = 2 * (batch_size_value * h1_value) # Bias add + ReLU -flops_l3_mm_value = 2 * batch_size_value * h2_value * out_dim_value -flops_l3_bias_value = 2 * (batch_size_value * out_dim_value) # Bias + Softmax approx + flops_l2_mm_value = 2 * batch_size_value * h1_value * h2_value + flops_l2_bias_value = 2 * (batch_size_value * h2_value) -total_flops_value = ( - flops_l1_mm_value - + flops_l1_bias_value - + flops_l2_mm_value - + flops_l2_bias_value - + flops_l3_mm_value - + flops_l3_bias_value -) -total_mops_value = (total_flops_value * flop).to(MFLOPs).magnitude + flops_l3_mm_value = 2 * batch_size_value * h2_value * out_dim_value + flops_l3_bias_value = 2 * (batch_size_value * out_dim_value) # Bias + Softmax approx -double_flops_l1_mm_value = 2 * batch_size_value * in_dim_value * double_h1_value -double_flops_l1_bias_value = 2 * (batch_size_value * double_h1_value) -double_flops_l2_mm_value = 2 * batch_size_value * double_h1_value * double_h2_value -double_flops_l2_bias_value = 2 * (batch_size_value * double_h2_value) -double_flops_l3_mm_value = 2 * batch_size_value * double_h2_value * out_dim_value -double_flops_l3_bias_value = 2 * (batch_size_value * out_dim_value) -double_total_flops_value = ( - double_flops_l1_mm_value - + double_flops_l1_bias_value - + double_flops_l2_mm_value - + double_flops_l2_bias_value - + double_flops_l3_mm_value - + double_flops_l3_bias_value -) -double_total_mops_value = (double_total_flops_value * flop).to(MFLOPs).magnitude -double_ratio_value = double_total_mops_value / total_mops_value + total_flops_value = ( + flops_l1_mm_value + + flops_l1_bias_value + + flops_l2_mm_value + + flops_l2_bias_value + + flops_l3_mm_value + + flops_l3_bias_value + ) + total_mops_value = (total_flops_value * flop).m_as(MFLOPs) -# --- Outputs (formatted strings for prose) --- -l1_mm_str = 
f"{flops_l1_mm_value:,}" -l1_bias_str = f"{flops_l1_bias_value:,}" -l2_mm_str = f"{flops_l2_mm_value:,}" -l2_bias_str = f"{flops_l2_bias_value:,}" -l3_mm_str = f"{flops_l3_mm_value:,}" -l3_bias_str = f"{flops_l3_bias_value:,}" -total_mops_str = fmt(total_mops_value, precision=1, commas=False) -double_total_mops_str = fmt(double_total_mops_value, precision=1, commas=False) -double_ratio_str = fmt(double_ratio_value, precision=1, commas=False) -double_ratio_exact_str = fmt(double_ratio_value, precision=2, commas=False) + double_flops_l1_mm_value = 2 * batch_size_value * in_dim_value * double_h1_value + double_flops_l1_bias_value = 2 * (batch_size_value * double_h1_value) + double_flops_l2_mm_value = 2 * batch_size_value * double_h1_value * double_h2_value + double_flops_l2_bias_value = 2 * (batch_size_value * double_h2_value) + double_flops_l3_mm_value = 2 * batch_size_value * double_h2_value * out_dim_value + double_flops_l3_bias_value = 2 * (batch_size_value * out_dim_value) + double_total_flops_value = ( + double_flops_l1_mm_value + + double_flops_l1_bias_value + + double_flops_l2_mm_value + + double_flops_l2_bias_value + + double_flops_l3_mm_value + + double_flops_l3_bias_value + ) + double_total_mops_value = (double_total_flops_value * flop).m_as(MFLOPs) + double_ratio_value = double_total_mops_value / total_mops_value + + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + l1_mm_str = f"{flops_l1_mm_value:,}" + l1_bias_str = f"{flops_l1_bias_value:,}" + l2_mm_str = f"{flops_l2_mm_value:,}" + l2_bias_str = f"{flops_l2_bias_value:,}" + l3_mm_str = f"{flops_l3_mm_value:,}" + l3_bias_str = f"{flops_l3_bias_value:,}" + total_mops_str = fmt(total_mops_value, precision=1, commas=False) + double_total_mops_str = fmt(double_total_mops_value, precision=1, commas=False) + double_ratio_str = fmt(double_ratio_value, precision=1, commas=False) + double_ratio_exact_str = fmt(double_ratio_value, precision=2, commas=False) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +l1_mm_str = MnistFlopsCalc.l1_mm_str +l1_bias_str = MnistFlopsCalc.l1_bias_str +l2_mm_str = MnistFlopsCalc.l2_mm_str +l2_bias_str = MnistFlopsCalc.l2_bias_str +l3_mm_str = MnistFlopsCalc.l3_mm_str +l3_bias_str = MnistFlopsCalc.l3_bias_str +total_mops_str = MnistFlopsCalc.total_mops_str +double_total_mops_str = MnistFlopsCalc.double_total_mops_str +double_ratio_str = MnistFlopsCalc.double_ratio_str +double_ratio_exact_str = MnistFlopsCalc.double_ratio_exact_str ``` ::: {.callout-example title="Counting Ops in Forward Pass"} @@ -3116,28 +3326,43 @@ The transition from training to inference introduces a constraint on model adapt # ┌───────────────────────────────────────────────────────────────────────────── # │ GPU SPECIFICATIONS FOR TRAINING FOOTNOTE # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Footnote [^fn-training-gpu-specs] describing modern training GPUs +# │ Context: Footnote [^fn-training-gpu-specs] in +# │ @sec-neural-computation-operational-phase-differences-3f95, +# │ supporting the prose on training-cluster GPU requirements. # │ -# │ Goal: Ground abstract training requirements in concrete silicon. -# │ Show: The memory capacity and power constraints of datacenter GPUs. 
-# │ How: Retrieve specs for A100 and H100 from mlsys.constants. +# │ Goal: Ground the "training requires high-memory GPUs" claim with exact A100 +# │ and H100 specs from mlsys.constants (single source of truth). +# │ Show: A100 provides 80 GiB HBM2e at 400 W; H100 consumes up to 700 W — +# │ numbers that explain why datacenter cooling is a first-order constraint. +# │ How: Read A100_MEM_CAPACITY, A100_TDP, and H100_TDP; extract scalar values +# │ with m_as(); format as integer strings for the footnote. # │ # │ Imports: mlsys.constants (A100_MEM_CAPACITY, A100_TDP, H100_TDP, GiB, watt), -# │ mlsys.formatting (fmt) +# │ mlsys.formatting (fmt, check) # │ Exports: a100_mem_gb_str, a100_tdp_w_str, h100_tdp_w_str # └───────────────────────────────────────────────────────────────────────────── + from mlsys.constants import A100_MEM_CAPACITY, A100_TDP, H100_TDP, GiB, watt from mlsys.formatting import fmt, check -# --- Inputs (from mlsys.constants) --- -a100_mem_gb_value = A100_MEM_CAPACITY.to(GiB).magnitude # e.g. 80 -a100_tdp_w_value = A100_TDP.to(watt).magnitude # e.g. 400 -h100_tdp_w_value = H100_TDP.to(watt).magnitude # e.g. 700 +# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── +class GpuSpecsFootnote: + """Namespace for Gpu Specs Footnote.""" -# --- Outputs (formatted strings for prose) --- -a100_mem_gb_str = fmt(a100_mem_gb_value, precision=0, commas=False) # e.g. "80" -a100_tdp_w_str = fmt(a100_tdp_w_value, precision=0, commas=False) # e.g. "400" -h100_tdp_w_str = fmt(h100_tdp_w_value, precision=0, commas=False) # e.g. "700" + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + a100_mem_gb_value = A100_MEM_CAPACITY.m_as(GiB) # e.g. 80 + a100_tdp_w_value = A100_TDP.m_as(watt) # e.g. 400 + h100_tdp_w_value = H100_TDP.m_as(watt) # e.g. 700 + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + a100_mem_gb_str = fmt(a100_mem_gb_value, precision=0, commas=False) # e.g. 
"80" + a100_tdp_w_str = fmt(a100_tdp_w_value, precision=0, commas=False) # e.g. "400" + h100_tdp_w_str = fmt(h100_tdp_w_value, precision=0, commas=False) # e.g. "700" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +a100_mem_gb_str = GpuSpecsFootnote.a100_mem_gb_str +a100_tdp_w_str = GpuSpecsFootnote.a100_tdp_w_str +h100_tdp_w_str = GpuSpecsFootnote.h100_tdp_w_str ``` Neural network operation divides into two distinct phases\index{Training vs. Inference!operational differences}\index{Inference!forward pass only} with markedly different computational requirements. @fig-training-vs-inference contrasts these phases visually. @@ -3547,27 +3772,42 @@ Once captured, the raw images are far from ready for neural network processing. # ┌───────────────────────────────────────────────────────────────────────────── # │ USPS LENET SPECIFICATIONS # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: USPS case study comparing 1989 LeNet to modern systems +# │ Context: @sec-neural-computation-engineering-process-design-decisions-2e8e +# │ and the "Then vs. Now" comparison callout in +# │ @sec-neural-computation-key-engineering-lessons-design-principles-f84e # │ -# │ Goal: Contrast historical LeNet scale with modern baselines. -# │ Show: That early networks were 10× smaller than even today's toy models. -# │ How: Calculate parameter count and memory for LeNet-1. +# │ Goal: Compute memory footprint for the 1989 LeNet-1 (~10,000 parameters) to +# │ quantify how compact early neural networks were relative to modern models. +# │ Show: LeNet-1 required only ~39 KB of FP32 storage — 10× smaller than even +# │ the toy MNIST MLP in the running example, anchoring the "Then vs. Now" +# │ hardware progress table. +# │ How: Set lenet_1_params = 10,000 (approximate historical value); compute +# │ memory in KiB using model_memory() with BYTES_FP32 precision. 
# │ # │ Imports: mlsys.constants (BYTES_FP32, KiB), mlsys.formulas (model_memory) # │ Exports: lenet_1_params_str, lenet_1_mem_kb_str # └───────────────────────────────────────────────────────────────────────────── + from mlsys.constants import BYTES_FP32, KiB from mlsys.formulas import model_memory -# --- Inputs (historical LeNet-1 architecture) --- -lenet_1_params = 10000 # approx params in 1989 LeNet +# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── +class UspsLenetSpecs: + """Namespace for Usps Lenet Specs.""" -# --- Process (memory calculation) --- -lenet_1_mem_kb = model_memory(lenet_1_params, BYTES_FP32, KiB) + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + lenet_1_params = 10000 # approx params in 1989 LeNet -# --- Outputs (formatted strings for prose) --- -lenet_1_params_str = f"{lenet_1_params:,}" # e.g. "10,000" -lenet_1_mem_kb_str = f"{lenet_1_mem_kb:.0f}" # e.g. "39" + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + lenet_1_mem_kb = model_memory(lenet_1_params, BYTES_FP32, KiB) + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + lenet_1_params_str = f"{lenet_1_params:,}" # e.g. "10,000" + lenet_1_mem_kb_str = f"{lenet_1_mem_kb:.0f}" # e.g. 
"39" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +lenet_1_params_str = UspsLenetSpecs.lenet_1_params_str +lenet_1_mem_kb_str = UspsLenetSpecs.lenet_1_mem_kb_str ``` \index{LeNet!architecture comparison} @@ -3710,30 +3950,46 @@ Neural network architecture demonstrates hierarchical processing, where each lay ```{python} #| label: mnist-weights-calc #| echo: false + # ┌───────────────────────────────────────────────────────────────────────────── # │ MNIST WEIGHTS CALC — FIRST-LAYER WEIGHT COUNT # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Summary section discussion of fully connected layer costs +# │ Context: @sec-neural-computation-summary-f263 — motivates specialized +# │ architectures (@sec-network-architectures) by showing the quadratic +# │ scaling cost of fully connected layers on image inputs. # │ -# │ Goal: Demonstrate the quadratic scaling trap of fully connected layers. -# │ Show: Why dense connectivity scales poorly for high-resolution inputs. -# │ How: Calculate the connection count for the first layer of an MLP. +# │ Goal: Compute the first-layer connection count for a 784-input MLP as a +# │ concrete illustration of why dense FC layers waste compute on images. +# │ Show: That 784 pixels × 128 neurons = 100,352 weights for a single layer, +# │ learning irrelevant long-range pixel relationships that convolutions +# │ avoid by design. +# │ How: Multiply mnist_pixels_value × fc1_neurons_value using built-in math; +# │ no external imports needed. # │ -# │ Imports: (none — uses only built-in arithmetic) +# │ Imports: (none — built-in arithmetic only) # │ Exports: fc1_weights_str, mnist_pixels_str, fc1_neurons_str # └───────────────────────────────────────────────────────────────────────────── -# --- Inputs (canonical MNIST first layer) --- -mnist_pixels_value = 784 -fc1_neurons_value = 128 +# ┌── P.I.C.O. 
ISOLATED SCENARIO ─────────────────────────────────────────────── +class MnistWeightsCalc: + """Namespace for Mnist Weights Calc.""" -# --- Process (weight count for dense layer) --- -fc1_weights_value = mnist_pixels_value * fc1_neurons_value + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + mnist_pixels_value = 784 + fc1_neurons_value = 128 -# --- Outputs (formatted strings for prose) --- -fc1_weights_str = f"{fc1_weights_value:,}" -mnist_pixels_str = f"{mnist_pixels_value}" -fc1_neurons_str = f"{fc1_neurons_value}" + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + fc1_weights_value = mnist_pixels_value * fc1_neurons_value + + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + fc1_weights_str = f"{fc1_weights_value:,}" + mnist_pixels_str = f"{mnist_pixels_value}" + fc1_neurons_str = f"{fc1_neurons_value}" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +fc1_weights_str = MnistWeightsCalc.fc1_weights_str +mnist_pixels_str = MnistWeightsCalc.mnist_pixels_str +fc1_neurons_str = MnistWeightsCalc.fc1_neurons_str ``` The running MNIST example made this escalation tangible: the same 28 $\times$ 28 digit that required ~`{python} rb_ops_str` rule-based comparisons demanded `{python} dl_total_macs_str` MACs in even a modest three-layer network—a `{python} dl_ops_ratio_str` $\times$ increase that generalizes across the systems dimensions captured in @tbl-evolution. These fundamentals primarily develop the **Algorithm** axis of the D·A·M taxonomy while revealing how algorithmic choices propagate into **Machine** constraints. 
diff --git a/book/quarto/contents/vol1/optimizations/model_compression.qmd b/book/quarto/contents/vol1/optimizations/model_compression.qmd index f96ae0a30..1c3310f1b 100644 --- a/book/quarto/contents/vol1/optimizations/model_compression.qmd +++ b/book/quarto/contents/vol1/optimizations/model_compression.qmd @@ -26,7 +26,6 @@ start_chapter("vol1:model_compression") ::: - ## Purpose {.unnumbered} \begin{marginfigure} @@ -78,102 +77,137 @@ Bridging that gap requires a systematic discipline of *compression*: trading cap from mlsys.constants import * from mlsys.formatting import fmt, check, sci -# --- Inputs (GPU specs) --- -a100_tflops_fp16_value = A100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude -a100_tflops_int8_value = A100_FLOPS_INT8.to(TFLOPs / second).magnitude -a100_bw_tbs_value = A100_MEM_BW.to(TB / second).magnitude -a100_int8_speedup_value = int(a100_tflops_int8_value / a100_tflops_fp16_value) +class CompressionSetup: + """Chapter-wide constants: GPU specs, energy physics, model sizes, device constraints.""" -# --- Inputs (energy/perf illustrative values) --- -int8_energy_reduction_value = 20 -mobilenet_int8_mj_value = 47 -mobilenet_fp32_mj_value = 312 -tpu_v4_tops_per_w_value = 0.9 -v100_tops_per_w_value = 0.3 -bandwidth_bound_speedup_value = 4 + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + # Illustrative energy/perf values + int8_energy_reduction = 20 + mobilenet_int8_mj = 47 + mobilenet_fp32_mj = 312 + tpu_v4_tops_per_w = 0.9 + v100_tops_per_w = 0.3 + bandwidth_bound_speedup = 4 + llm_7b_params = 7 + gpt3_training_flops_exp = 23 -# --- Inputs (energy: multiply-add operations from constants) --- -energy_dram_value = ENERGY_DRAM_ACCESS_PJ.magnitude -energy_dram_per_byte_value = ENERGY_DRAM_PJ_PER_BYTE.magnitude -energy_flop_fp32_value = ENERGY_FLOP_FP32_PJ.magnitude -energy_flop_int8_value = ENERGY_FLOP_INT8_PJ.magnitude + # ┌── 2. 
CALCULATION (The Physics) ──────────────────────────────────────── + # A100 specs + a100_tflops_fp16 = A100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second) + a100_tflops_int8 = A100_FLOPS_INT8.m_as(TFLOPs / second) + a100_bw_tbs = A100_MEM_BW.m_as(TB / second) + a100_int8_speedup = int(a100_tflops_int8 / a100_tflops_fp16) -# Energy for addition operations (Horowitz 2014, 45nm process) -energy_add_fp32_pj_value = ENERGY_ADD_FP32_PJ.to(ureg.picojoule).magnitude -energy_add_fp16_pj_value = ENERGY_ADD_FP16_PJ.to(ureg.picojoule).magnitude -energy_add_int32_pj_value = ENERGY_ADD_INT32_PJ.to(ureg.picojoule).magnitude -energy_add_int8_pj_value = ENERGY_ADD_INT8_PJ.to(ureg.picojoule).magnitude -energy_mul_fp32_pj_value = ENERGY_FLOP_FP32_PJ.magnitude + # Energy from constants (Horowitz 2014, 45nm process) + energy_dram = ENERGY_DRAM_ACCESS_PJ.m_as(ureg.picojoule) + energy_dram_per_byte = ENERGY_DRAM_PJ_PER_BYTE.m_as(ureg.picojoule / ureg.byte) + energy_flop_fp32 = ENERGY_FLOP_FP32_PJ.m_as(ureg.picojoule / ureg.count) + energy_flop_int8 = ENERGY_FLOP_INT8_PJ.m_as(ureg.picojoule / ureg.count) + energy_add_fp32_pj = ENERGY_ADD_FP32_PJ.m_as(ureg.picojoule) + energy_add_fp16_pj = ENERGY_ADD_FP16_PJ.m_as(ureg.picojoule) + energy_add_int32_pj = ENERGY_ADD_INT32_PJ.m_as(ureg.picojoule) + energy_add_int8_pj = ENERGY_ADD_INT8_PJ.m_as(ureg.picojoule) + energy_mul_fp32_pj = ENERGY_FLOP_FP32_PJ.m_as(ureg.picojoule / ureg.count) -# INT8 vs FP32 energy ratio (MAC-to-MAC: multiply + add for each precision) -fp32_mac_pj_value = energy_mul_fp32_pj_value + energy_add_fp32_pj_value # 3.7 + 0.9 = 4.6 pJ -int8_mac_pj_value = energy_flop_int8_value + energy_add_int8_pj_value # 0.2 + 0.03 = 0.23 pJ -int8_fp32_energy_ratio_value = fp32_mac_pj_value / int8_mac_pj_value + # INT8 vs FP32 MAC energy ratio + fp32_mac_pj = energy_mul_fp32_pj + energy_add_fp32_pj # 3.7 + 0.9 = 4.6 pJ + int8_mac_pj = energy_flop_int8 + energy_add_int8_pj # 0.2 + 0.03 = 0.23 pJ + int8_fp32_energy_ratio = fp32_mac_pj / int8_mac_pj 
-# V100 specs -v100_bw_gbs_value = V100_MEM_BW.to(GB / second).magnitude -v100_tflops_fp32_value = V100_FLOPS_FP32.to(TFLOPs / second).magnitude + # V100 specs + v100_bw_gbs = V100_MEM_BW.m_as(GB / second) + v100_tflops_fp32 = V100_FLOPS_FP32.m_as(TFLOPs / second) -# Model specs -resnet_params_m_value = RESNET50_PARAMS.to(Mparam).magnitude -resnet_gflops_value = RESNET50_FLOPs.to(GFLOPs).magnitude -mobilenetv2_mflops_value = MOBILENETV2_FLOPs.to(GFLOPs).magnitude * 1000 + # Model specs + resnet_params_m = RESNET50_PARAMS.m_as(Mparam) + resnet_gflops = RESNET50_FLOPs.m_as(GFLOPs) + mobilenetv2_mflops = MOBILENETV2_FLOPs.m_as(GFLOPs) * 1000 -# LLM parameter/memory calculations -llm_7b_params_value = 7 -llm_7b_mem_fp16_gb_value = llm_7b_params_value * 2 -llm_175b_params_value = GPT3_PARAMS.to(Bparam).magnitude -llm_175b_mem_fp16_gb_value = llm_175b_params_value * 2 + # LLM memory + llm_7b_mem_fp16_gb = llm_7b_params * 2 + llm_175b_params = GPT3_PARAMS.m_as(Bparam) + llm_175b_mem_fp16_gb = llm_175b_params * 2 -# Device memory constraints -smartphone_ram_gb_value = SMARTPHONE_RAM_GB.to(GB).magnitude -mcu_ram_kb_value = MCU_RAM_KIB.to(KiB).magnitude + # Device memory + smartphone_ram_gb = SMARTPHONE_RAM_GB.m_as(GB) + mcu_ram_kb = MCU_RAM_KIB.m_as(KiB) -# GPT-3 training FLOPs -gpt3_training_flops_exp_value = 23 + # ┌── 3. 
INVARIANTS (Guardrails) ────────────────────────────────────────── + check(a100_int8_speedup >= 2, "A100 INT8 should be at least 2x faster than FP16.") + check(int8_fp32_energy_ratio > 1, "FP32 MAC must cost more energy than INT8 MAC.") -# --- Outputs (formatted strings for prose) --- -a100_tflops_fp16_str = fmt(a100_tflops_fp16_value, precision=0, commas=False) -a100_tflops_int8_str = fmt(a100_tflops_int8_value, precision=0, commas=False) -a100_bw_tbs_str = fmt(a100_bw_tbs_value, precision=1, commas=False) -a100_int8_speedup_str = fmt(a100_int8_speedup_value, precision=0, commas=False) -int8_energy_reduction_str = fmt(int8_energy_reduction_value, precision=0, commas=False) -mobilenet_int8_mj_str = fmt(mobilenet_int8_mj_value, precision=0, commas=False) -mobilenet_fp32_mj_str = fmt(mobilenet_fp32_mj_value, precision=0, commas=False) -tpu_v4_tops_per_w_str = fmt(tpu_v4_tops_per_w_value, precision=1, commas=False) -v100_tops_per_w_str = fmt(v100_tops_per_w_value, precision=1, commas=False) -bandwidth_bound_speedup_str = fmt(bandwidth_bound_speedup_value, precision=0, commas=False) + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + a100_tflops_fp16_str = fmt(a100_tflops_fp16, precision=0, commas=False) + a100_tflops_int8_str = fmt(a100_tflops_int8, precision=0, commas=False) + a100_bw_tbs_str = fmt(a100_bw_tbs, precision=1, commas=False) + a100_int8_speedup_str = fmt(a100_int8_speedup, precision=0, commas=False) + int8_energy_reduction_str = fmt(int8_energy_reduction, precision=0, commas=False) + mobilenet_int8_mj_str = fmt(mobilenet_int8_mj, precision=0, commas=False) + mobilenet_fp32_mj_str = fmt(mobilenet_fp32_mj, precision=0, commas=False) + tpu_v4_tops_per_w_str = fmt(tpu_v4_tops_per_w, precision=1, commas=False) + v100_tops_per_w_str = fmt(v100_tops_per_w, precision=1, commas=False) + bandwidth_bound_speedup_str = fmt(bandwidth_bound_speedup, precision=0, commas=False) + energy_dram_str = fmt(energy_dram, precision=0, commas=False) + energy_dram_per_byte_str = fmt(energy_dram_per_byte, precision=0, commas=False) + energy_flop_fp32_str = f"{energy_flop_fp32}" + energy_flop_int8_str = f"{energy_flop_int8}" + energy_add_fp32_str = f"{energy_add_fp32_pj}" + energy_add_fp16_str = f"{energy_add_fp16_pj}" + energy_add_int32_str = f"{energy_add_int32_pj}" + energy_add_int8_str = f"{energy_add_int8_pj}" + energy_mul_fp32_str = f"{energy_mul_fp32_pj}" + int8_fp32_energy_ratio_str = fmt(int8_fp32_energy_ratio, precision=1, commas=False) + v100_bw_gbs_str = fmt(v100_bw_gbs, precision=0, commas=False) + v100_tflops_fp32_str = fmt(v100_tflops_fp32, precision=1, commas=False) + resnet_params_m_str = fmt(resnet_params_m, precision=1, commas=False) + resnet_gflops_str = fmt(resnet_gflops, precision=1, commas=False) + mobilenetv2_mflops_str = fmt(mobilenetv2_mflops, precision=0, commas=False) + llm_7b_str = f"{llm_7b_params}" + llm_7b_mem_str = fmt(llm_7b_mem_fp16_gb, precision=0, commas=False) + llm_175b_str = fmt(llm_175b_params, precision=0, commas=False) + llm_175b_mem_str = fmt(llm_175b_mem_fp16_gb, precision=0, commas=False) + 
smartphone_ram_str = f"{smartphone_ram_gb}" + mcu_ram_str = f"{mcu_ram_kb}" + gpt3_training_flops_str = f"$3.14 \\times 10^{{{gpt3_training_flops_exp}}}$" -energy_dram_str = fmt(energy_dram_value, precision=0, commas=False) -energy_dram_per_byte_str = fmt(energy_dram_per_byte_value, precision=0, commas=False) -energy_flop_fp32_str = f"{energy_flop_fp32_value}" -energy_flop_int8_str = f"{energy_flop_int8_value}" - -energy_add_fp32_str = f"{energy_add_fp32_pj_value}" -energy_add_fp16_str = f"{energy_add_fp16_pj_value}" -energy_add_int32_str = f"{energy_add_int32_pj_value}" -energy_add_int8_str = f"{energy_add_int8_pj_value}" -energy_mul_fp32_str = f"{energy_mul_fp32_pj_value}" - -int8_fp32_energy_ratio_str = fmt(int8_fp32_energy_ratio_value, precision=1, commas=False) - -v100_bw_gbs_str = fmt(v100_bw_gbs_value, precision=0, commas=False) -v100_tflops_fp32_str = fmt(v100_tflops_fp32_value, precision=1, commas=False) - -resnet_params_m_str = fmt(resnet_params_m_value, precision=1, commas=False) -resnet_gflops_str = fmt(resnet_gflops_value, precision=1, commas=False) -mobilenetv2_mflops_str = fmt(mobilenetv2_mflops_value, precision=0, commas=False) - -llm_7b_str = f"{llm_7b_params_value}" -llm_7b_mem_str = fmt(llm_7b_mem_fp16_gb_value, precision=0, commas=False) -llm_175b_str = fmt(llm_175b_params_value, precision=0, commas=False) -llm_175b_mem_str = fmt(llm_175b_mem_fp16_gb_value, precision=0, commas=False) -smartphone_ram_str = f"{smartphone_ram_gb_value}" -mcu_ram_str = f"{mcu_ram_kb_value}" -gpt3_training_flops_str = f"$3.14 \\times 10^{{{gpt3_training_flops_exp_value}}}$" +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +a100_tflops_fp16_str = CompressionSetup.a100_tflops_fp16_str +a100_tflops_int8_str = CompressionSetup.a100_tflops_int8_str +a100_bw_tbs_str = CompressionSetup.a100_bw_tbs_str +a100_int8_speedup_str = CompressionSetup.a100_int8_speedup_str +int8_energy_reduction_str = CompressionSetup.int8_energy_reduction_str 
+mobilenet_int8_mj_str = CompressionSetup.mobilenet_int8_mj_str +mobilenet_fp32_mj_str = CompressionSetup.mobilenet_fp32_mj_str +tpu_v4_tops_per_w_str = CompressionSetup.tpu_v4_tops_per_w_str +v100_tops_per_w_str = CompressionSetup.v100_tops_per_w_str +bandwidth_bound_speedup_str = CompressionSetup.bandwidth_bound_speedup_str +energy_dram_str = CompressionSetup.energy_dram_str +energy_dram_per_byte_str = CompressionSetup.energy_dram_per_byte_str +energy_flop_fp32_str = CompressionSetup.energy_flop_fp32_str +energy_flop_int8_str = CompressionSetup.energy_flop_int8_str +energy_add_fp32_str = CompressionSetup.energy_add_fp32_str +energy_add_fp16_str = CompressionSetup.energy_add_fp16_str +energy_add_int32_str = CompressionSetup.energy_add_int32_str +energy_add_int8_str = CompressionSetup.energy_add_int8_str +energy_mul_fp32_str = CompressionSetup.energy_mul_fp32_str +int8_fp32_energy_ratio_str = CompressionSetup.int8_fp32_energy_ratio_str +v100_bw_gbs_str = CompressionSetup.v100_bw_gbs_str +v100_tflops_fp32_str = CompressionSetup.v100_tflops_fp32_str +resnet_params_m_str = CompressionSetup.resnet_params_m_str +resnet_gflops_str = CompressionSetup.resnet_gflops_str +mobilenetv2_mflops_str = CompressionSetup.mobilenetv2_mflops_str +llm_7b_str = CompressionSetup.llm_7b_str +llm_7b_mem_str = CompressionSetup.llm_7b_mem_str +llm_175b_str = CompressionSetup.llm_175b_str +llm_175b_mem_str = CompressionSetup.llm_175b_mem_str +smartphone_ram_str = CompressionSetup.smartphone_ram_str +mcu_ram_str = CompressionSetup.mcu_ram_str +gpt3_training_flops_str = CompressionSetup.gpt3_training_flops_str +# Note: v100_bw_gbs_value used by downstream fusion-calc cell +v100_bw_gbs_value = CompressionSetup.v100_bw_gbs +v100_tflops_fp32_value = CompressionSetup.v100_tflops_fp32 ``` - ## Optimization Framework {#sec-model-compression-optimization-framework-9e21} A `{python} llm_7b_str`-billion parameter language model requires `{python} llm_7b_mem_str` GB just to store its weights in FP16. 
Your deployment target is a smartphone with `{python} smartphone_ram_str` GB of RAM shared across the operating system, applications, and your model. *The math does not work.* No amount of clever engineering changes this arithmetic: `{python} llm_7b_mem_str` GB cannot fit in `{python} smartphone_ram_str` GB. Yet users expect the model to run: responsively, offline, without draining their battery in an hour. The gap between what training produces and what deployment permits (the Latency Budget, the maximum allowable end-to-end inference time, defined formally in @sec-model-serving) is not a minor inconvenience but a defining challenge of model compression. @@ -420,7 +454,6 @@ We call this phenomenon *the quantization speedup*. The relative importance of each dimension varies by deployment target. Cloud systems may tolerate larger models but demand throughput; mobile devices prioritize memory and energy; embedded systems face hard constraints on all resources simultaneously. Understanding these deployment contexts shapes which optimization dimensions to prioritize. - ## Deployment Context {#sec-model-compression-deployment-context-0d88} The optimization framework above identifies three dimensions of compression, but which dimensions matter most depends entirely on where the model will run. A datacenter GPU with 80 GB of HBM faces different binding constraints than a smartphone with shared RAM or a microcontroller with 256 KB of SRAM. @tbl-deployment-scenarios summarizes the key constraints across deployment environments. 
@@ -482,55 +515,80 @@ from mlsys.constants import (GB, GiB, MiB, KiB, MB, KB, byte, CLOUD_MEM_GIB, MOBILE_MEM_GIB, TINY_MEM_KIB, DLRM_MODEL_SIZE_FP32) -# --- Inputs (device capacities and model sizes) --- -cloud_mem_value = CLOUD_MEM_GIB -mobile_mem_value = MOBILE_MEM_GIB -tiny_mem_value = TINY_MEM_KIB - -dlrm_mem_value = DLRM_MODEL_SIZE_FP32 -gpt2_mem_value = 6 * GiB -resnet_mem_value = 100 * MiB -mobilenet_mem_value = 14 * MiB -mobilenet_int8_mem_value = 3.5 * MiB -dscnn_mem_value = 500 * KiB - -# --- Process (compute fit ratios) --- -def get_ratio(model_mem, device_mem): - ratio = model_mem.to(byte).magnitude / device_mem.to(byte).magnitude +def _get_ratio(model_mem, device_mem): + """Return 'ok' if model fits, else 'no (Nx)' with how many times it overflows.""" + ratio = model_mem.m_as(byte) / device_mem.m_as(byte) if ratio < 1: return "ok" return f"no ({ratio:.0f}x)" -dlrm_mobile_value = get_ratio(dlrm_mem_value, mobile_mem_value) -dlrm_tiny_value = get_ratio(dlrm_mem_value, tiny_mem_value) +class ModelDeviceComparison: + """Contrast model requirements with device memory: 6-order-of-magnitude deployment gap.""" -gpt2_mobile_value = get_ratio(gpt2_mem_value, mobile_mem_value) -gpt2_tiny_value = get_ratio(gpt2_mem_value, tiny_mem_value) + # ┌── 1. 
PARAMETERS (Inputs) ────────────────────────────────────────────── + # Device capacities + cloud_mem = CLOUD_MEM_GIB + mobile_mem = MOBILE_MEM_GIB + tiny_mem = TINY_MEM_KIB -resnet_tiny_value = get_ratio(resnet_mem_value, tiny_mem_value) -mobilenet_tiny_value = get_ratio(mobilenet_mem_value, tiny_mem_value) -mobilenet_int8_tiny_value = get_ratio(mobilenet_int8_mem_value, tiny_mem_value) + # Model sizes + dlrm_mem = DLRM_MODEL_SIZE_FP32 + gpt2_mem = 6 * GiB + resnet_mem = 100 * MiB + mobilenet_mem = 14 * MiB + mobilenet_int8_mem = 3.5 * MiB + dscnn_mem = 500 * KiB -# --- Outputs (formatted strings for prose) --- -dlrm_str = f"{dlrm_mem_value.to(GB).magnitude:.0f} GB" -gpt2_str = f"{gpt2_mem_value.to(GiB).magnitude:.0f} GB" -resnet_str = f"{resnet_mem_value.to(MiB).magnitude:.0f} MB" -mobilenet_str = f"{mobilenet_mem_value.to(MiB).magnitude:.0f} MB" -mobilenet_int8_str = f"{mobilenet_int8_mem_value.to(MiB).magnitude:.1f} MB" -dscnn_str = f"{dscnn_mem_value.to(KiB).magnitude:.0f} KB" + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + dlrm_mobile = _get_ratio(dlrm_mem, mobile_mem) + dlrm_tiny = _get_ratio(dlrm_mem, tiny_mem) + gpt2_mobile = _get_ratio(gpt2_mem, mobile_mem) + gpt2_tiny = _get_ratio(gpt2_mem, tiny_mem) + resnet_tiny = _get_ratio(resnet_mem, tiny_mem) + mobilenet_tiny = _get_ratio(mobilenet_mem, tiny_mem) + mobilenet_int8_tiny = _get_ratio(mobilenet_int8_mem, tiny_mem) -cloud_cap_str = f"~{cloud_mem_value.to(GiB).magnitude:.0f} GB" -mobile_cap_str = f"~{mobile_mem_value.to(GiB).magnitude:.0f} GB" -tiny_cap_str = f"~{tiny_mem_value.to(KiB).magnitude:.0f} KB" + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + # DS-CNN always fits TinyML — sanity check + assert _get_ratio(dscnn_mem, tiny_mem) == "ok", "DS-CNN must fit in TinyML device." 
-dlrm_mobile_str = dlrm_mobile_value -dlrm_tiny_str = dlrm_tiny_value -gpt2_mobile_str = gpt2_mobile_value -gpt2_tiny_str = gpt2_tiny_value -resnet_tiny_str = resnet_tiny_value -mobilenet_tiny_str = mobilenet_tiny_value -mobilenet_int8_tiny_str = mobilenet_int8_tiny_value -dscnn_tiny_str = "ok" + # ┌── 4. OUTPUTS (Formatting) ───────────────────────────────────────────── + dlrm_str = f"{dlrm_mem.m_as(GB):.0f} GB" + gpt2_str = f"{gpt2_mem.m_as(GiB):.0f} GB" + resnet_str = f"{resnet_mem.m_as(MiB):.0f} MB" + mobilenet_str = f"{mobilenet_mem.m_as(MiB):.0f} MB" + mobilenet_int8_str = f"{mobilenet_int8_mem.m_as(MiB):.1f} MB" + dscnn_str = f"{dscnn_mem.m_as(KiB):.0f} KB" + cloud_cap_str = f"~{cloud_mem.m_as(GiB):.0f} GB" + mobile_cap_str = f"~{mobile_mem.m_as(GiB):.0f} GB" + tiny_cap_str = f"~{tiny_mem.m_as(KiB):.0f} KB" + dlrm_mobile_str = dlrm_mobile + dlrm_tiny_str = dlrm_tiny + gpt2_mobile_str = gpt2_mobile + gpt2_tiny_str = gpt2_tiny + resnet_tiny_str = resnet_tiny + mobilenet_tiny_str = mobilenet_tiny + mobilenet_int8_tiny_str = mobilenet_int8_tiny + dscnn_tiny_str = "ok" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +dlrm_str = ModelDeviceComparison.dlrm_str +gpt2_str = ModelDeviceComparison.gpt2_str +resnet_str = ModelDeviceComparison.resnet_str +mobilenet_str = ModelDeviceComparison.mobilenet_str +mobilenet_int8_str = ModelDeviceComparison.mobilenet_int8_str +dscnn_str = ModelDeviceComparison.dscnn_str +cloud_cap_str = ModelDeviceComparison.cloud_cap_str +mobile_cap_str = ModelDeviceComparison.mobile_cap_str +tiny_cap_str = ModelDeviceComparison.tiny_cap_str +dlrm_mobile_str = ModelDeviceComparison.dlrm_mobile_str +dlrm_tiny_str = ModelDeviceComparison.dlrm_tiny_str +gpt2_mobile_str = ModelDeviceComparison.gpt2_mobile_str +gpt2_tiny_str = ModelDeviceComparison.gpt2_tiny_str +resnet_tiny_str = ModelDeviceComparison.resnet_tiny_str +mobilenet_tiny_str = ModelDeviceComparison.mobilenet_tiny_str +mobilenet_int8_tiny_str = 
ModelDeviceComparison.mobilenet_int8_tiny_str +dscnn_tiny_str = ModelDeviceComparison.dscnn_tiny_str ``` | **Model** | **Memory** **(Runtime)** | **Storage** **(Weights)** | **Cloud** **(`{python} cloud_cap_str`)** | **Mobile** **(`{python} mobile_cap_str`)** | **TinyML** **(`{python} tiny_cap_str`)** | @@ -600,7 +658,6 @@ Optimization is about trading one resource for another. Each deployment context above imposes a binding constraint: memory capacity on mobile devices, latency on real-time systems, energy on battery-powered sensors. The optimization techniques that follow address these constraints at three successive levels of the stack. We begin with structural methods that modify *what* computations occur, reducing the model's parameter count and operation count to fit tighter memory and compute budgets. We then turn to precision techniques that reduce how many bits represent each value, directly shrinking memory footprint and accelerating arithmetic. Finally, we address architectural approaches that improve how efficiently the remaining operations execute on physical hardware, closing the gap between theoretical savings and measured performance. - ## Structural Optimization {#sec-model-compression-structural-optimization-ee93} \index{Model Compression!structural optimization} @@ -2764,7 +2821,6 @@ Test your understanding of the structural optimization techniques covered so far - [ ] Can you identify when to choose Neural Architecture Search over manual architecture design? Consider the trade-offs in computational cost, design space coverage, and hardware-specific optimization. ::: - ## Quantization and Precision {#sec-model-compression-quantization-precision-cd46} \index{Model Compression!precision optimization} @@ -3690,44 +3746,57 @@ Compare the two mapping diagrams side by side in @fig-calibration-ranges. 
Symmet # │ zero_point_str, x_val_str, x_q_str, x_recon_str # └───────────────────────────────────────────────────────────────────────────── from mlsys.formatting import fmt, check -from mlsys.constants import KIB_TO_BYTES -# --- Inputs (activation range example) --- -alpha_value = -1.0 -beta_value = 3.0 -bits_value = 8 -x_val_value = 0.0 # value to quantize +class QuantizationMathCalc: + """Derive affine quantization parameters: scale and zero-point for [-1.0, 3.0] → UINT8.""" -# --- Process (calculate affine parameters) --- -# 1. Calculate Scale (s) -# s = (beta - alpha) / (2^b - 1) -int_steps_value = 2**bits_value - 1 -scale_value = (beta_value - alpha_value) / int_steps_value + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + alpha = -1.0 # activation range min + beta = 3.0 # activation range max + bits = 8 # target bit-width + x_val = 0.0 # value to quantize -# 2. Calculate Zero-Point (z) -# z = round(-alpha / s) -# Note: z maps the real value 0.0 to an integer -zero_point_value = round(-alpha_value / scale_value) + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + # 1. Scale: s = (beta - alpha) / (2^b - 1) + int_steps = 2**bits - 1 + scale = (beta - alpha) / int_steps -# 3. Quantize a value -# x_q = clamp(round(x / s) + z, 0, 2^b - 1) -x_q_raw = round(x_val_value / scale_value) + zero_point_value -x_q_value = max(0, min(int_steps_value, x_q_raw)) + # 2. Zero-point: z = round(-alpha / s) + zero_point = round(-alpha / scale) -# 4. Dequantize (reconstruct) -# x_recon = (x_q - z) * s -x_recon_value = (x_q_value - zero_point_value) * scale_value + # 3. 
Quantize: x_q = clamp(round(x/s) + z, 0, 2^b - 1) + x_q_raw = round(x_val / scale) + zero_point + x_q = max(0, min(int_steps, x_q_raw)) -# --- Outputs (formatted strings for prose) --- -alpha_str = fmt(alpha_value, precision=1, commas=False) # "-1.0" -beta_str = fmt(beta_value, precision=1, commas=False) # "3.0" -range_str = fmt(beta_value - alpha_value, precision=1, commas=False) # "4.0" -steps_str = f"{int_steps_value}" # "255" -scale_str = fmt(scale_value, precision=4, commas=False) # "0.0157" -zero_point_str = f"{int(zero_point_value)}" # "64" -x_val_str = fmt(x_val_value, precision=1, commas=False) # "0.0" -x_q_str = f"{int(x_q_value)}" # "64" -x_recon_str = fmt(x_recon_value, precision=2, commas=False) # "0.00" + # 4. Dequantize: x_recon = (x_q - z) * s + x_recon = (x_q - zero_point) * scale + + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(scale > 0, "Scale must be positive.") + check(0 <= zero_point <= int_steps, "Zero-point must be in valid integer range.") + check(abs(x_recon - x_val) < scale, "Reconstruction error must be less than one step size.") + + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + alpha_str = fmt(alpha, precision=1, commas=False) # "-1.0" + beta_str = fmt(beta, precision=1, commas=False) # "3.0" + range_str = fmt(beta - alpha, precision=1, commas=False) # "4.0" + steps_str = f"{int_steps}" # "255" + scale_str = fmt(scale, precision=4, commas=False) # "0.0157" + zero_point_str = f"{int(zero_point)}" # "64" + x_val_str = fmt(x_val, precision=1, commas=False) # "0.0" + x_q_str = f"{int(x_q)}" # "64" + x_recon_str = fmt(x_recon, precision=2, commas=False) # "0.00" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +alpha_str = QuantizationMathCalc.alpha_str +beta_str = QuantizationMathCalc.beta_str +range_str = QuantizationMathCalc.range_str +steps_str = QuantizationMathCalc.steps_str +scale_str = QuantizationMathCalc.scale_str +zero_point_str = QuantizationMathCalc.zero_point_str +x_val_str = QuantizationMathCalc.x_val_str +x_q_str = QuantizationMathCalc.x_q_str +x_recon_str = QuantizationMathCalc.x_recon_str ``` ::: {.callout-notebook title="Calculating Scale and Zero-Point"} @@ -4326,7 +4395,6 @@ Yet practitioners often discover a frustrating gap between theory and practice: The gap arises from several sources. Sparse matrices stored in dense format waste memory bandwidth loading zeros—the hardware cannot skip what it does not know is zero. Operations that could run in parallel execute sequentially due to data dependencies the compiler cannot resolve. Simple inputs receive the same computational budget as complex ones because the model has no mechanism to exit early. Closing the gap between "optimized on paper" and "optimized in practice" is the domain of our third optimization dimension: **architectural efficiency**. This dimension ensures that structural and precision optimizations translate into real-world speedups by aligning computation patterns with hardware capabilities. 
- ## Architectural Efficiency {#sec-model-compression-architectural-efficiency-8dd3} Architectural efficiency optimization ensures that computations execute efficiently on target hardware by aligning model operations with processor capabilities and memory hierarchies. Where representation optimization determines *what* computations to perform and precision optimization determines *how precisely* to compute, architectural efficiency addresses *how* operations are scheduled, memory is accessed, and workloads adapt to input characteristics. This third dimension closes the gap between theoretical compression ratios and real-world speedups. @@ -4452,77 +4520,102 @@ Beyond reducing what data must be stored, substantial efficiency gains emerge fr # │ kernels_fused_str, saved_latency_ms_str # └───────────────────────────────────────────────────────────────────────────── from mlsys.formatting import fmt, check -from mlsys.constants import KIB_TO_BYTES +from mlsys.constants import KIB_TO_BYTES, MILLION -# --- Inputs (Conv-BN-ReLU) --- -conv_channels_value = 256 -conv_spatial_value = 28 -bytes_per_element_value = 4 +class FusionCalc: + """Quantify latency and bandwidth benefits of Conv-BN-ReLU operator fusion on ResNet-50.""" -# GEMM -gemm_hidden_value = 768 -gemm_seq_value = 512 + # ┌── 1. 
PARAMETERS (Inputs) ────────────────────────────────────────────── + # Conv-BN-ReLU layer geometry + conv_channels = 256 + conv_spatial = 28 + bytes_per_element = 4 # FP32 -# Memory Bandwidth Analysis (ResNet-50 layer) -# Feature map: 256 channels × 28 × 28 spatial × 4 bytes/element (FP32) -feat_map_mb_value = conv_channels_value * conv_spatial_value * conv_spatial_value * bytes_per_element_value / MILLION # SI MB -weights_mb_value = 2.4 -bn_params_mb_value = 0.002 + # GEMM geometry + gemm_hidden = 768 + gemm_seq = 512 -# Kernel Launch -kernels_unfused_value = 159 -kernels_fused_value = 53 -latency_per_kernel_us_value = 10 + # ResNet-50 layer memory baseline + weights_mb = 2.4 + bn_params_mb = 0.002 -# --- Process --- -# Conv-BN-ReLU intermediate -conv_bn_relu_intermediate_bytes = 2 * conv_channels_value * conv_spatial_value * conv_spatial_value * bytes_per_element_value -conv_bn_relu_intermediate_mb_value = conv_bn_relu_intermediate_bytes / (1024**2) + # Kernel launch overhead + kernels_unfused = 159 + kernels_fused = 53 + latency_per_kernel_us = 10 -# GEMM intermediate -gemm_intermediate_bytes = gemm_hidden_value * gemm_seq_value * bytes_per_element_value -gemm_intermediate_mb_value = gemm_intermediate_bytes / (1024**2) + # ┌── 2. 
CALCULATION (The Physics) ──────────────────────────────────────── + # Feature map size (SI MB) + feat_map_mb = conv_channels * conv_spatial * conv_spatial * bytes_per_element / MILLION -# Conv-BN-ReLU intermediate -conv_bn_relu_intermediate_bytes = 2 * conv_channels_value * conv_spatial_value * conv_spatial_value * bytes_per_element_value -conv_bn_relu_intermediate_mb_value = conv_bn_relu_intermediate_bytes / (1024**2) + # Conv-BN-ReLU intermediate (2 feature maps: conv→BN and BN→ReLU boundaries) + conv_bn_relu_intermediate_mb = ( + 2 * conv_channels * conv_spatial * conv_spatial * bytes_per_element / (1024**2) + ) -# GEMM intermediate -gemm_intermediate_bytes = gemm_hidden_value * gemm_seq_value * bytes_per_element_value -gemm_intermediate_mb_value = gemm_intermediate_bytes / (1024**2) + # GEMM intermediate + gemm_intermediate_mb = gemm_hidden * gemm_seq * bytes_per_element / (1024**2) -# Bandwidth Analysis -unfused_conv_mb_value = feat_map_mb_value * 2 + weights_mb_value -unfused_bn_mb_value = feat_map_mb_value * 2 + bn_params_mb_value -unfused_relu_mb_value = feat_map_mb_value * 2 -total_unfused_mb_value = unfused_conv_mb_value + unfused_bn_mb_value + unfused_relu_mb_value + # Unfused bandwidth: Conv (feat*2 + weights) + BN (feat*2 + bn) + ReLU (feat*2) + unfused_conv_mb = feat_map_mb * 2 + weights_mb + unfused_bn_mb = feat_map_mb * 2 + bn_params_mb + unfused_relu_mb = feat_map_mb * 2 + total_unfused_mb = unfused_conv_mb + unfused_bn_mb + unfused_relu_mb -total_fused_mb_value = feat_map_mb_value * 2 + weights_mb_value -bandwidth_reduction_pct_value = (1 - total_fused_mb_value / total_unfused_mb_value) * 100 + # Fused bandwidth: read input + weights once, write output once + total_fused_mb = feat_map_mb * 2 + weights_mb + bandwidth_reduction_pct = (1 - total_fused_mb / total_unfused_mb) * 100 -# Kernel Launch -saved_latency_us_value = (kernels_unfused_value - kernels_fused_value) * latency_per_kernel_us_value -saved_latency_ms_value = saved_latency_us_value / 1000 
-conv_bn_relu_intermediate_mb_str = fmt(conv_bn_relu_intermediate_mb_value, precision=1, commas=False) -gemm_intermediate_mb_str = fmt(gemm_intermediate_mb_value, precision=1, commas=False) + # Kernel launch savings + saved_latency_us = (kernels_unfused - kernels_fused) * latency_per_kernel_us + saved_latency_ms = saved_latency_us / 1000 -feat_map_kb_str = fmt(feat_map_mb_value * 1000, precision=0, commas=False) -weights_mb_str = fmt(weights_mb_value, precision=1, commas=False) -bn_params_kb_str = fmt(bn_params_mb_value * KIB_TO_BYTES, precision=0, commas=False) + # V100 timing (memory-bound): MB / (GB/s) * 1000 = µs + unfused_time_us = total_unfused_mb / v100_bw_gbs_value * 1000 + fused_time_us = total_fused_mb / v100_bw_gbs_value * 1000 + fusion_speedup = unfused_time_us / fused_time_us -unfused_conv_mb_str = fmt(unfused_conv_mb_value, precision=1, commas=False) -unfused_bn_mb_str = fmt(unfused_bn_mb_value, precision=1, commas=False) -unfused_relu_mb_str = fmt(unfused_relu_mb_value, precision=1, commas=False) -total_unfused_mb_str = fmt(total_unfused_mb_value, precision=1, commas=False) -total_fused_mb_str = fmt(total_fused_mb_value, precision=1, commas=False) -bandwidth_reduction_pct_str = fmt(bandwidth_reduction_pct_value, precision=0, commas=False) + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(bandwidth_reduction_pct > 40, "Fusion should reduce bandwidth by more than 40%.") + check(fusion_speedup > 1, "Fused execution must be faster than unfused.") -kernels_unfused_str = fmt(kernels_unfused_value, precision=0, commas=False) -kernels_fused_str = fmt(kernels_fused_value, precision=0, commas=False) -saved_latency_ms_str = fmt(saved_latency_ms_value, precision=0, commas=False) -unfused_time_us_str = fmt(unfused_time_us_value, precision=0, commas=False) -fused_time_us_str = fmt(fused_time_us_value, precision=1, commas=False) -fusion_speedup_str = fmt(fusion_speedup_value, precision=2, commas=False) + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + conv_bn_relu_intermediate_mb_str = fmt(conv_bn_relu_intermediate_mb, precision=1, commas=False) + gemm_intermediate_mb_str = fmt(gemm_intermediate_mb, precision=1, commas=False) + feat_map_kb_str = fmt(feat_map_mb * 1000, precision=0, commas=False) + weights_mb_str = fmt(weights_mb, precision=1, commas=False) + bn_params_kb_str = fmt(bn_params_mb * KIB_TO_BYTES, precision=0, commas=False) + unfused_conv_mb_str = fmt(unfused_conv_mb, precision=1, commas=False) + unfused_bn_mb_str = fmt(unfused_bn_mb, precision=1, commas=False) + unfused_relu_mb_str = fmt(unfused_relu_mb, precision=1, commas=False) + total_unfused_mb_str = fmt(total_unfused_mb, precision=1, commas=False) + total_fused_mb_str = fmt(total_fused_mb, precision=1, commas=False) + bandwidth_reduction_pct_str = fmt(bandwidth_reduction_pct, precision=0, commas=False) + kernels_unfused_str = fmt(kernels_unfused, precision=0, commas=False) + kernels_fused_str = fmt(kernels_fused, precision=0, commas=False) + saved_latency_ms_str = fmt(saved_latency_ms, precision=0, commas=False) + unfused_time_us_str = fmt(unfused_time_us, precision=0, commas=False) + fused_time_us_str = fmt(fused_time_us, precision=1, commas=False) + fusion_speedup_str = fmt(fusion_speedup, precision=2, commas=False) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +conv_bn_relu_intermediate_mb_str = FusionCalc.conv_bn_relu_intermediate_mb_str +gemm_intermediate_mb_str = FusionCalc.gemm_intermediate_mb_str +feat_map_kb_str = FusionCalc.feat_map_kb_str +weights_mb_str = FusionCalc.weights_mb_str +bn_params_kb_str = FusionCalc.bn_params_kb_str +unfused_conv_mb_str = FusionCalc.unfused_conv_mb_str +unfused_bn_mb_str = FusionCalc.unfused_bn_mb_str +unfused_relu_mb_str = FusionCalc.unfused_relu_mb_str +total_unfused_mb_str = FusionCalc.total_unfused_mb_str +total_fused_mb_str = FusionCalc.total_fused_mb_str +bandwidth_reduction_pct_str = 
FusionCalc.bandwidth_reduction_pct_str +kernels_unfused_str = FusionCalc.kernels_unfused_str +kernels_fused_str = FusionCalc.kernels_fused_str +saved_latency_ms_str = FusionCalc.saved_latency_ms_str +unfused_time_us_str = FusionCalc.unfused_time_us_str +fused_time_us_str = FusionCalc.fused_time_us_str +fusion_speedup_str = FusionCalc.fusion_speedup_str ``` #### Operator Fusion {#sec-model-compression-operator-fusion-ac1d} @@ -4594,16 +4687,28 @@ def conv_bn_relu_fused(input, weight, gamma, beta, mean, var): # └───────────────────────────────────────────────────────────────────────────── from mlsys.formatting import fmt, check, md_math -# --- Inputs (transfer counts) --- -unfused_transfers_value = 6 # read/write for each of conv, BN, ReLU -fused_transfers_value = 2 # read input, write output +class ConvFusionCalc: + """Demonstrate 3x memory traffic reduction from Conv-BN-ReLU fusion (6 transfers → 2).""" -# --- Process --- -transfer_reduction_value = unfused_transfers_value / fused_transfers_value + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + unfused_transfers = 6 # read/write for Conv, BN, ReLU + fused_transfers = 2 # read input, write output -# --- Outputs (formatted strings for prose) --- -transfer_reduction_str = fmt(transfer_reduction_value, precision=0, commas=False) -conv_bn_relu_mem_md = md_math(f"2 \\times 256 \\times 28 \\times 28 \\times 4 \\text{{ bytes}} \\approx \\text{{{conv_bn_relu_intermediate_mb_str} MB}}") + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + transfer_reduction = unfused_transfers / fused_transfers + + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(transfer_reduction == 3, "Conv-BN-ReLU fusion must yield exactly 3x transfer reduction.") + + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + transfer_reduction_str = fmt(transfer_reduction, precision=0, commas=False) + conv_bn_relu_mem_md = md_math( + f"2 \\times 256 \\times 28 \\times 28 \\times 4 \\text{{ bytes}} \\approx \\text{{{conv_bn_relu_intermediate_mb_str} MB}}" + ) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +transfer_reduction_str = ConvFusionCalc.transfer_reduction_str +conv_bn_relu_mem_md = ConvFusionCalc.conv_bn_relu_mem_md ``` The arithmetic operations remain identical, but memory traffic drops from 6 transfers to 2 transfers (`{python} transfer_reduction_str` $\times$ reduction). For a ResNet-50 layer with 256 channels and spatial size $28 \times 28$, this eliminates `{python} conv_bn_relu_mem_md` of intermediate memory traffic per layer. @@ -6276,7 +6381,6 @@ Unlike software functions that compose predictably, optimization techniques inte With the three optimization dimensions now fully explored, practitioners need systematic guidance for translating this knowledge into deployment decisions. - ## Technique Selection {#sec-model-compression-technique-selection-ba16} An engineer deploying a transformer model faces a concrete decision: the model exceeds the target device's memory by 3 $\times$, inference latency is 4 $\times$ above the SLO, and the power budget allows no more than 2 W sustained. Should she quantize first, prune first, distill to a smaller architecture, or combine techniques? The answer depends on which constraint is binding, what accuracy loss is tolerable, and how much engineering time is available. This section provides structured guidance for navigating that decision. @@ -6314,7 +6418,6 @@ These choices also depend on the available engineering budget. When fine-tuning This decision framework provides starting points for individual technique selection. 
Validating that a chosen technique actually achieves its intended goal requires systematic profiling and measurement, which @sec-model-compression-efficiency-measurement-2424 formalizes in detail. However, production deployments rarely rely on a single technique. Combining pruning with quantization, or distillation with hardware-aware design, introduces interaction effects that can either amplify benefits or create unexpected accuracy degradation. The following section addresses how to sequence and combine techniques effectively. - ## Optimization Strategies {#sec-model-compression-optimization-strategies-f2f6} The decision framework above guides individual technique selection, but the largest optimization gains emerge from combining multiple techniques. Because pruning, quantization, and architectural efficiency operate at different levels of the stack, they provide multiplicative benefits when sequenced appropriately. @@ -6528,7 +6631,6 @@ This example illustrates why sequencing matters: pruning first concentrates impo With dozens of techniques across three optimization dimensions, rigorous measurement is essential for validating that optimizations achieve their intended goals. A practitioner who prunes, quantizes, and fuses without profiling the actual impact on target hardware is optimizing blindly. - ## Efficiency Measurement {#sec-model-compression-efficiency-measurement-2424} A model quantized to INT8 should be 4 $\times$ smaller and roughly 3 $\times$ faster, but does it actually achieve those gains on the target hardware? Theoretical compression ratios and measured deployment improvements often diverge, sometimes dramatically, because real speedups depend on memory hierarchy effects, kernel implementations, and hardware utilization patterns that theory alone cannot predict. Translating theoretical compression ratios into measurable deployment improvements therefore requires systematic profiling and evaluation. 
@@ -6566,7 +6668,6 @@ With these comprehensive baselines in place, the measurement framework must trac Rigorous measurement tells practitioners *whether* their optimizations succeeded, but the measurements themselves require tooling to perform. Profiling, quantization, pruning, and deployment all depend on software frameworks that automate otherwise prohibitively complex workflows. We turn now to the implementation tools that make these techniques practical. - ## Implementation Tools {#sec-model-compression-implementation-tools-4990} Understanding optimization techniques is necessary but not sufficient; practical implementation relies on robust software support. Without framework tooling, quantization would require manual modification of model definitions and careful insertion of quantization operations throughout the network, while pruning would demand direct manipulation of weight tensors. Both become prohibitively complex as models scale. @@ -6655,7 +6756,6 @@ Sparsity heat maps show sparsity distribution across layers (@fig-sparse-heat-ma With the implementation tools and visualization capabilities established, the natural question is: how do these techniques compare when a practitioner must choose among them? Each optimization approach carries distinct trade-offs in accuracy, training cost, and hardware requirements, and a structured comparison clarifies which to reach for first. - ## Technique Comparison {#sec-model-compression-technique-comparison-3142} A comparative analysis across the three major approaches reveals how each addresses distinct aspects of the efficiency-accuracy trade-off. Pruning works best when sparse computation hardware is available and when reducing floating-point operations is critical. Quantization provides the most versatile approach with broad hardware support, making it ideal for diverse deployment scenarios. 
Knowledge distillation requires significant computational investment but produces consistently high-quality compressed models, making it the right choice when accuracy preservation is paramount. @tbl-optimization-comparison summarizes these trade-offs for systematic technique selection. @@ -6673,7 +6773,6 @@ These techniques combine synergistically, with quantization often applied after With the complete optimization toolkit now surveyed—from individual techniques through combination strategies—the most instructive lessons often come not from what works but from what fails. The following fallacies and pitfalls capture the most common mistakes engineers make when applying these techniques, each grounded in the quantitative trade-offs we have established throughout the chapter. - ## Fallacies and Pitfalls {#sec-model-compression-fallacies-pitfalls-1b5e} ```{python} @@ -6773,7 +6872,6 @@ Teams apply post-training quantization (PTQ) to avoid retraining and achieve 96. Teams achieve `{python} int8_size_reduction_str` $\times$ model size reduction through INT8 quantization and expect `{python} int8_size_reduction_str` $\times$ memory savings in deployment. In practice, runtime overhead erodes compression gains. Dequantization kernels add `{python} dequant_overhead_str`% latency overhead converting INT8 weights back to FP16. Pruned models with irregular sparsity achieve only 12% latency reduction despite `{python} param_removal_str`% parameter removal because hardware cannot skip zeroed weights efficiently. As @sec-model-compression-profiling-opportunity-analysis-477f demonstrates, a BERT model pruned to 50% sparsity and quantized to INT8 achieves `{python} actual_speedup_str`% end-to-end speedup rather than the expected `{python} expected_speedup_str` $\times$, because unstructured sparsity creates irregular memory access. Production workflows must profile *deployed* latency on target hardware, not extrapolate from compression ratios. 
- ## Summary {#sec-model-compression-summary-8229} Model compression is not a bag of tricks but an engineering discipline built on three complementary dimensions: *structural optimization* determines what the model computes, *precision optimization* determines how precisely it computes, and *architectural optimization* determines how efficiently those computations execute on physical hardware. The most important lesson of this chapter is that these dimensions compose multiplicatively. Pruning alone might achieve 2 $\times$ compression; quantization alone might achieve 4 $\times$; but pruning, distillation, and quantization applied together can achieve 16 $\times$ — as BERT's compression from 440 MB to 28 MB demonstrates. The second lesson is equally important: theoretical compression ratios lie. A 4 $\times$ reduction in parameters translates to 4 $\times$ latency improvement only when the optimization aligns with the hardware's execution model. Unstructured sparsity on hardware that lacks sparse kernels achieves almost nothing; INT8 quantization on hardware without INT8 units achieves even less. Profile on target hardware, not paper metrics. 
diff --git a/book/quarto/contents/vol2/backmatter/appendix_c3.qmd b/book/quarto/contents/vol2/backmatter/appendix_c3.qmd index 883cdcfac..4a9c47209 100644 --- a/book/quarto/contents/vol2/backmatter/appendix_c3.qmd +++ b/book/quarto/contents/vol2/backmatter/appendix_c3.qmd @@ -21,6 +21,26 @@ When training throughput is low, check MFU, communication fraction, and goodput ```{python} #| label: appendix-c3-setup #| echo: false +# ┌───────────────────────────────────────────────────────────────────────────── +# │ C³ TAXONOMY — MASTER COMPUTATION +# ├───────────────────────────────────────────────────────────────────────────── +# │ Context: PERSISTENT — All values used throughout the C³ Taxonomy appendix: +# │ @tbl-c3-dam-mapping, @tbl-c3-diagnostic-summary, @tbl-c3-traffic-light, +# │ @tbl-c3-bottleneck-actions, three case studies, scorecard, and exercises. +# │ +# │ Goal: Provide all C³ diagnostic constants — case study parameters, effective +# │ FLOPS decomposition, and threshold strings — for the fleet-scale +# │ bottleneck classification reference appendix. +# │ Show: See individual section prose for formatted values. This cell provides +# │ the physics; string attributes are display-ready. +# │ How: calc_effective_flops() with MFU, scaling efficiency, and goodput ratio; +# │ all results as raw floats extracted via .m_as() or .magnitude where unitless. 
+# │ +# │ Imports: mlsys.constants (H100_FLOPS_FP16_TENSOR, MFU_*, SCALING_EFF_*, OVERHEAD_*, …) +# │ mlsys.formulas (calc_effective_flops) +# │ mlsys.formatting (fmt, check, md_math) +# │ Exports: C3 = C3Taxonomy (accessed as C3.attribute in downstream cells) +# └───────────────────────────────────────────────────────────────────────────── import math from mlsys.constants import ( @@ -35,15 +55,6 @@ from mlsys.constants import ( from mlsys.formatting import fmt, check, md_math from mlsys.formulas import calc_effective_flops -# ============================================================================= -# PURPOSE -# ============================================================================= -# Purpose: Compute all values for the C³ Taxonomy appendix. -# Used in: Case studies, effective FLOPS, scorecard, and inline prose. -# -# Philosophy: C³ parallels D·A·M — three MECE axes for fleet-scale diagnosis. -# Every computed value traces back to constants.py. - class C3Taxonomy: """Namespace for C³ diagnostic examples.""" @@ -71,7 +82,7 @@ class C3Taxonomy: case3_oh_maintenance_pct = OVERHEAD_MAINTENANCE * 100 # Effective FLOPS calculation: 100K GPU cluster - h100_tflops = H100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude + h100_tflops = H100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second) n_gpus_eff = 100_000 peak_pflops = n_gpus_eff * h100_tflops / 1000 # PFLOPs goodput_all = 1.0 - (OVERHEAD_PIPELINE_BUBBLE + @@ -80,7 +91,7 @@ class C3Taxonomy: OVERHEAD_MAINTENANCE) effective_pflops = calc_effective_flops( peak_pflops, MFU_TRAINING_HIGH, SCALING_EFF_8192GPU, goodput_all - ) + ).magnitude # extract float; calc_effective_flops returns Quantity since formulas.py upgrade c3_tax = peak_pflops / effective_pflops eff_fraction = effective_pflops / peak_pflops @@ -445,12 +456,8 @@ The gap between scaling-law predictions and observed training outcomes is, in la ```{python} #| label: appendix-c3-effective-flops #| echo: false - -# 
============================================================================= -# PURPOSE -# ============================================================================= -# Purpose: Format effective FLOPS values for the worked example. -# Used in: Effective FLOPS worked example prose. +# Goal: Alias C3Taxonomy strings for the 100K-GPU effective FLOPS callout prose. +# Exports: peak_str, eff_str, eff_pct_str, c3_tax_str, mfu_str, scaling_str, goodput_str peak_str = C3.peak_pflops_str eff_str = C3.effective_pflops_str diff --git a/book/quarto/contents/vol2/backmatter/appendix_fleet.qmd b/book/quarto/contents/vol2/backmatter/appendix_fleet.qmd index fbd64af2a..59ea6d808 100644 --- a/book/quarto/contents/vol2/backmatter/appendix_fleet.qmd +++ b/book/quarto/contents/vol2/backmatter/appendix_fleet.qmd @@ -15,6 +15,23 @@ This appendix collects the reference numbers and compact models for fleet-scale ```{python} #| label: appendix-fleet-setup #| echo: false +# ┌───────────────────────────────────────────────────────────────────────────── +# │ FLEET FOUNDATIONS — MASTER COMPUTATION +# ├───────────────────────────────────────────────────────────────────────────── +# │ Context: PERSISTENT — All values used throughout the Fleet Foundations +# │ appendix: hardware reference table, MTBF tables, checkpoint sizing, +# │ effective FLOPS, comm-compute ratio, and all prose inline values. +# │ +# │ Goal: Provide all quantitative fleet engineering constants in one place +# │ for the "Numbers Every Fleet Engineer Should Know" reference appendix. +# │ Show: See individual section cells for formatted values. This cell provides +# │ the physics; formatting cells convert to display strings. +# │ How: pint Quantities from mlsys.constants; fleet formulas from formulas.py; +# │ all results as typed Quantities or raw floats via .m_as(). 
+# │ +# │ Imports: mlsys.constants (*), mlsys.formulas (calc_*), mlsys.formatting (fmt, check) +# │ Exports: FF = FleetFoundations (accessed as FF.attribute in downstream cells) +# └───────────────────────────────────────────────────────────────────────────── import math from mlsys.constants import * @@ -26,27 +43,13 @@ from mlsys.formulas import ( calc_young_daly_interval, calc_checkpoint_size ) -# ============================================================================= -# PURPOSE -# ============================================================================= -# Purpose: Compute all values for the Fleet Foundations appendix. -# Used in: Reference tables, worked examples, and inline prose throughout. -# -# Philosophy: Fleet-scale numbers emphasize RATIOS between tiers and -# SCALING BEHAVIOR with cluster size. Absolute values are -# current-generation snapshots; ratios persist across generations. - -# ============================================================================= -# NETWORK HIERARCHY -# ============================================================================= - class FleetFoundations: """Namespace for fleet-scale reference calculations.""" # ── Communication Numbers ──────────────────────────────────────────────── # Bandwidth hierarchy (GB/s) - nvlink_h100_bw = int(NVLINK_H100_BW.to(GB / second).magnitude) - pcie5_bw = int(PCIE_GEN5_BW.to(GB / second).magnitude) + nvlink_h100_bw = int(NVLINK_H100_BW.m_as(GB / second)) + pcie5_bw = int(PCIE_GEN5_BW.m_as(GB / second)) ib_ndr_bw = INFINIBAND_NDR_BW_GBS ib_hdr_bw = INFINIBAND_HDR_BW_GBS ib_xdr_bw = INFINIBAND_XDR_BW_GBS @@ -95,28 +98,29 @@ class FleetFoundations: mtbf_100k_h = calc_mtbf_cluster(GPU_MTTF_HOURS, cl_mega) # Convert to minutes for readability - mtbf_256_min = mtbf_256_h * 60 - mtbf_2048_min = mtbf_2048_h * 60 - mtbf_8192_min = mtbf_8192_h * 60 - mtbf_100k_min = mtbf_100k_h * 60 + mtbf_256_min = mtbf_256_h.m_as(ureg.minute) + mtbf_2048_min = mtbf_2048_h.m_as(ureg.minute) + 
mtbf_8192_min = mtbf_8192_h.m_as(ureg.minute) + mtbf_100k_min = mtbf_100k_h.m_as(ureg.minute) - # Failure probability for a 24-hour job (using hours consistently) - pfail_256_24h = calc_failure_probability(mtbf_256_h, 24) - pfail_2048_24h = calc_failure_probability(mtbf_2048_h, 24) - pfail_8192_24h = calc_failure_probability(mtbf_8192_h, 24) - pfail_100k_24h = calc_failure_probability(mtbf_100k_h, 24) + # Failure probability for a 24-hour job + _24h = 24 * ureg.hour + pfail_256_24h = calc_failure_probability(mtbf_256_h, _24h) + pfail_2048_24h = calc_failure_probability(mtbf_2048_h, _24h) + pfail_8192_24h = calc_failure_probability(mtbf_8192_h, _24h) + pfail_100k_24h = calc_failure_probability(mtbf_100k_h, _24h) - # Checkpoint sizes (bytes) - ckpt_7b = calc_checkpoint_size(7e9) + # Checkpoint sizes + ckpt_7b = calc_checkpoint_size(7e9) # Quantity[byte] ckpt_70b = calc_checkpoint_size(70e9) ckpt_175b = calc_checkpoint_size(175e9) ckpt_1t = calc_checkpoint_size(1e12) - # Convert to GB - ckpt_7b_gb = ckpt_7b / 1e9 - ckpt_70b_gb = ckpt_70b / 1e9 - ckpt_175b_gb = ckpt_175b / 1e9 - ckpt_1t_tb = ckpt_1t / 1e12 + # Extract in GB/TB + ckpt_7b_gb = ckpt_7b.m_as(GB) + ckpt_70b_gb = ckpt_70b.m_as(GB) + ckpt_175b_gb = ckpt_175b.m_as(GB) + ckpt_1t_tb = ckpt_1t.m_as(TB) # Overhead budgets oh_pipeline = int(OVERHEAD_PIPELINE_BUBBLE * 100) @@ -125,20 +129,20 @@ class FleetFoundations: oh_maintenance = int(OVERHEAD_MAINTENANCE * 100) # ── Hardware Reference ─────────────────────────────────────────────────── - h100_flops = int(H100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude) - h100_bw_tbs = f"{H100_MEM_BW.to(TB / second).magnitude:.2f}" - h100_cap = int(H100_MEM_CAPACITY.to(GiB).magnitude) - h100_tdp = int(H100_TDP.magnitude) + h100_flops = int(H100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second)) + h100_bw_tbs = f"{H100_MEM_BW.m_as(TB / second):.2f}" + h100_cap = int(H100_MEM_CAPACITY.m_as(GiB)) + h100_tdp = int(H100_TDP.m_as(watt)) - b200_flops = int(B200_FLOPS_FP16_TENSOR.to(TFLOPs / 
second).magnitude) - b200_bw_tbs = f"{B200_MEM_BW.to(TB / second).magnitude:.0f}" - b200_cap = int(B200_MEM_CAPACITY.to(GiB).magnitude) - b200_tdp = int(B200_TDP.magnitude) + b200_flops = int(B200_FLOPS_FP16_TENSOR.m_as(TFLOPs / second)) + b200_bw_tbs = f"{B200_MEM_BW.m_as(TB / second):.0f}" + b200_cap = int(B200_MEM_CAPACITY.m_as(GiB)) + b200_tdp = int(B200_TDP.m_as(watt)) - tpuv5_flops = int(TPUV5P_FLOPS_BF16.to(TFLOPs / second).magnitude) - tpuv5_bw_tbs = f"{TPUV5P_MEM_BW.to(TB / second).magnitude:.2f}" - tpuv5_cap = int(TPUV5P_MEM_CAPACITY.to(GiB).magnitude) - tpuv5_ici = int(TPUV5P_ICI_BW.to(GB / second).magnitude) + tpuv5_flops = int(TPUV5P_FLOPS_BF16.m_as(TFLOPs / second)) + tpuv5_bw_tbs = f"{TPUV5P_MEM_BW.m_as(TB / second):.2f}" + tpuv5_cap = int(TPUV5P_MEM_CAPACITY.m_as(GiB)) + tpuv5_ici = int(TPUV5P_ICI_BW.m_as(GB / second)) # ── Power and Sustainability ───────────────────────────────────────────── rack_trad = RACK_POWER_TRADITIONAL_KW @@ -154,17 +158,19 @@ class FleetFoundations: # ── Effective FLOPS Example ────────────────────────────────────────────── # 1024-GPU cluster, H100, realistic overheads - peak_1024 = 1024 * H100_FLOPS_FP16_TENSOR.to(TFLOPs / second).magnitude + _peak_1024_qty = 1024 * H100_FLOPS_FP16_TENSOR # Quantity[TFLOPs/s] + peak_1024 = _peak_1024_qty.m_as(TFLOPs / second) # raw float for display goodput_ratio = 1.0 - (OVERHEAD_PIPELINE_BUBBLE + OVERHEAD_CHECKPOINT + OVERHEAD_FAILURE_RECOVERY + OVERHEAD_MAINTENANCE) - eff_flops_1024 = calc_effective_flops( - peak_1024, + _eff_flops_1024_qty = calc_effective_flops( + _peak_1024_qty, MFU_TRAINING_HIGH, SCALING_EFF_1024GPU, goodput_ratio - ) + ) # Quantity[flop/second] + eff_flops_1024 = _eff_flops_1024_qty.m_as(TFLOPs / second) # raw float for display eff_fraction = eff_flops_1024 / peak_1024 # ── Invariant Checks ───────────────────────────────────────────────────── @@ -289,12 +295,8 @@ Communication defines the boundaries of parallelism. 
These tables quantify the b ```{python} #| label: fleet-comm-numbers #| echo: false - -# ============================================================================= -# PURPOSE -# ============================================================================= -# Purpose: Compute communication hierarchy values for inline references. -# Used in: Communication numbers tables and prose. +# Goal: Format communication bandwidth and latency strings for @tbl-fleet-bandwidth-hierarchy and @tbl-fleet-latency-hierarchy. +# Exports: nvlink_bw_str, pcie5_bw_str, ib_*_str, tpuv5_ici_str, nvlink_to_ib_str, *_lat_str # ── Bandwidth ratios ──────────────────────────────────────────────────────── nvlink_bw_str = fmt(FF.nvlink_h100_bw, precision=0) @@ -386,12 +388,8 @@ At fleet scale, coordination---failure recovery, checkpointing, and maintenance- ```{python} #| label: fleet-mtbf-table #| echo: false - -# ============================================================================= -# PURPOSE -# ============================================================================= -# Purpose: Format MTBF and failure probability values for the table. -# Used in: MTBF by cluster size table. +# Goal: Format MTBF hours, minutes, and P(failure) percentages for @tbl-fleet-mtbf. +# Exports: mtbf_256_str, mtbf_2048_str, mtbf_8192_str, mtbf_100k_str, mtbf_*_min_str, pfail_*_str mtbf_256_str = fmt(FF.mtbf_256_h, precision=1, commas=False) mtbf_2048_str = fmt(FF.mtbf_2048_h, precision=1, commas=False) @@ -432,12 +430,8 @@ Checkpointing is the primary recovery mechanism, and its cost depends on the mod ```{python} #| label: fleet-checkpoint-sizes #| echo: false - -# ============================================================================= -# PURPOSE -# ============================================================================= -# Purpose: Format checkpoint sizes for the reference table. -# Used in: Checkpoint size table. +# Goal: Format checkpoint sizes in GB/TB for @tbl-fleet-checkpoint-sizes. 
+# Exports: ckpt_7b_str, ckpt_70b_str, ckpt_175b_str, ckpt_1t_str ckpt_7b_str = fmt(FF.ckpt_7b_gb, precision=0) ckpt_70b_str = fmt(FF.ckpt_70b_gb, precision=0) @@ -484,12 +478,8 @@ These numbers reflect the current generation of fleet-scale hardware. Use them f ```{python} #| label: fleet-hardware-ref #| echo: false - -# ============================================================================= -# PURPOSE -# ============================================================================= -# Purpose: Format hardware reference values for the comparison table. -# Used in: Current hardware reference table. +# Goal: Format H100, B200, and TPU v5p specs for @tbl-fleet-hardware-ref. +# Exports: h100_flops_str, h100_bw_str, h100_cap_str, h100_tdp_str, b200_*, tpuv5_* h100_flops_str = fmt(FF.h100_flops, precision=0) h100_bw_str = FF.h100_bw_tbs @@ -547,36 +537,52 @@ Volume I introduced Amdahl's Law for a single machine, where the serial fraction ```{python} #| label: fleet-amdahl-example #| echo: false +# ┌───────────────────────────────────────────────────────────────────────────── +# │ FLEET AMDAHL EXAMPLE +# ├───────────────────────────────────────────────────────────────────────────── +# │ Context: @sec-fleet-foundations-amdahls-fleet worked example +# │ +# │ Goal: Compute Amdahl's Law speedup at 32/256/1024/8192 GPUs for 10% serial fraction. +# │ Show: Speedup values and the Amdahl ceiling for inline prose. +# │ How: calc_amdahls_speedup() from formulas.py; check() for invariants. +# │ +# │ Imports: mlsys.formulas (calc_amdahls_speedup), mlsys.formatting (fmt, check) +# │ Exports: s_fleet_pct_str, max_speedup_str, su_32_str, su_256_str, su_1024_str, su_8192_str +# └───────────────────────────────────────────────────────────────────────────── -# ============================================================================= -# PURPOSE -# ============================================================================= -# Purpose: Compute Amdahl's Law examples at fleet scale. 
-# Used in: Amdahl's Law at Fleet Scale worked example. +class FleetAmdahlExample: + """Amdahl's Law speedup at 32/256/1024/8192 GPUs for 10% serial fraction.""" -# ── PARAMETERS ────────────────────────────────────────────────────────────── -s_fleet = 0.10 # 10% serial fraction (communication + sync) -n_values = [32, 256, 1024, 8192] + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + s_fleet = 0.10 + n_values = [32, 256, 1024, 8192] -# ── CALCULATION ───────────────────────────────────────────────────────────── -speedups = {} -for n in n_values: - su = calc_amdahls_speedup(1 - s_fleet, n) - speedups[n] = su + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + speedups = {} + for _n in n_values: + speedups[_n] = calc_amdahls_speedup(1 - s_fleet, _n) -max_speedup = 1 / s_fleet + max_speedup = 1 / s_fleet -# ── INVARIANTS ────────────────────────────────────────────────────────────── -check(speedups[8192] < max_speedup, "Speedup must be below Amdahl limit") -check(max_speedup == 10.0, "Max speedup for 10% serial fraction must be 10x") + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + check(speedups[8192] < max_speedup, "Speedup must be below Amdahl limit") + check(max_speedup == 10.0, "Max speedup for 10% serial fraction must be 10x") -# ── OUTPUTS ───────────────────────────────────────────────────────────────── -s_fleet_pct_str = "10" -max_speedup_str = fmt(max_speedup, precision=0, commas=False) -su_32_str = fmt(speedups[32], precision=1, commas=False) -su_256_str = fmt(speedups[256], precision=1, commas=False) -su_1024_str = fmt(speedups[1024], precision=1, commas=False) -su_8192_str = fmt(speedups[8192], precision=1, commas=False) + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + s_fleet_pct_str = "10" + max_speedup_str = fmt(max_speedup, precision=0, commas=False) + su_32_str = fmt(speedups[32], precision=1, commas=False) + su_256_str = fmt(speedups[256], precision=1, commas=False) + su_1024_str = fmt(speedups[1024], precision=1, commas=False) + su_8192_str = fmt(speedups[8192], precision=1, commas=False) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +s_fleet_pct_str = FleetAmdahlExample.s_fleet_pct_str +max_speedup_str = FleetAmdahlExample.max_speedup_str +su_32_str = FleetAmdahlExample.su_32_str +su_256_str = FleetAmdahlExample.su_256_str +su_1024_str = FleetAmdahlExample.su_1024_str +su_8192_str = FleetAmdahlExample.su_8192_str ``` To see the fleet-scale implications, consider a training workload where `{python} s_fleet_pct_str`% of wall-clock time is spent in synchronization, communication, and other serial overhead. Amdahl's Law gives the following speedups: @@ -604,58 +610,72 @@ When $\rho < 1$, computation dominates and communication can be overlapped. When ```{python} #| label: fleet-comm-comp-ratio #| echo: false +# ┌───────────────────────────────────────────────────────────────────────────── +# │ FLEET COMM-COMPUTE RATIO +# ├───────────────────────────────────────────────────────────────────────────── +# │ Context: @sec-fleet-foundations-comm-compute-ratio worked example (@tbl-fleet-comm-comp) +# │ +# │ Goal: Compute ρ = T_comm / T_comp for 3 scenarios: 7B DP, 350M DP, tensor-parallel. +# │ Show: AllReduce times in ms and ρ ratios for each scenario; ~0.1 for DP 7B, ~3 for DP 350M. +# │ How: calc_ring_allreduce_time() with IB NDR params; NVLink BW for tensor-parallel. 
+# │ +# │ Imports: mlsys.constants (INFINIBAND_NDR_BW_GBS, IB_NDR_LATENCY_US, NVLINK_H100_BW, GB, second) +# │ Exports: ar_7b_ms_str, rho_7b_str, rho_7b (raw float), ar_350m_ms_str, rho_350m_str, rho_tp_str +# └───────────────────────────────────────────────────────────────────────────── -# ============================================================================= -# PURPOSE -# ============================================================================= -# Purpose: Compute communication-computation ratios for different scenarios. -# Used in: Communication-computation ratio worked example. +class FleetCommCompRatio: + """Communication-to-computation ratio ρ for three parallelism scenarios.""" -# ── SCENARIO 1: Data parallelism, large model ────────────────────────────── -# 7B model, 256 GPUs, IB NDR -grad_bytes_7b = 7e9 * 2 # 7B params * 2 bytes (BF16 gradients) -allreduce_time_7b = calc_ring_allreduce_time( - message_bytes=grad_bytes_7b, - n_gpus=256, - bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9, - latency_s=IB_NDR_LATENCY_US * 1e-6 -) + # ── SCENARIO 1: Data parallelism, large model ────────────────────────── + # 7B model, 256 GPUs, IB NDR + grad_bytes_7b = 7e9 * 2 # 7B params * 2 bytes (BF16 gradients) + allreduce_time_7b = calc_ring_allreduce_time( + message_bytes=grad_bytes_7b, + n_gpus=256, + bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9, + latency_s=IB_NDR_LATENCY_US * 1e-6 + ) # Quantity[second] -# Computation time: assume ~50ms forward+backward per step -comp_time_7b = 0.050 # 50 ms -rho_7b = allreduce_time_7b / comp_time_7b + comp_time_7b = 0.050 # 50 ms (seconds) + rho_7b = allreduce_time_7b.m_as(ureg.second) / comp_time_7b -# ── SCENARIO 2: Data parallelism, small model ────────────────────────────── -# 350M model, 256 GPUs, IB NDR -grad_bytes_350m = 350e6 * 2 # 350M params * 2 bytes -allreduce_time_350m = calc_ring_allreduce_time( - message_bytes=grad_bytes_350m, - n_gpus=256, - bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9, - latency_s=IB_NDR_LATENCY_US * 1e-6 -)
-comp_time_350m = 0.005 # 5 ms (smaller model) -rho_350m = allreduce_time_350m / comp_time_350m + # ── SCENARIO 2: Data parallelism, small model ────────────────────────── + # 350M model, 256 GPUs, IB NDR + grad_bytes_350m = 350e6 * 2 # 350M params * 2 bytes + allreduce_time_350m = calc_ring_allreduce_time( + message_bytes=grad_bytes_350m, + n_gpus=256, + bandwidth_bytes_s=INFINIBAND_NDR_BW_GBS * 1e9, + latency_s=IB_NDR_LATENCY_US * 1e-6 + ) # Quantity[second] + comp_time_350m = 0.005 # 5 ms (seconds, smaller model) + rho_350m = allreduce_time_350m.m_as(ureg.second) / comp_time_350m -# ── SCENARIO 3: Tensor parallelism, within node ──────────────────────────── -# Activation transfer: 8 GPUs, NVLink, ~16 MB per layer -act_bytes = 16e6 # 16 MB -act_transfer_time = act_bytes / (NVLINK_H100_BW.to(GB / second).magnitude * 1e9) -comp_time_layer = 0.001 # 1 ms per layer -rho_tp = act_transfer_time / comp_time_layer + # ── SCENARIO 3: Tensor parallelism, within node ──────────────────────── + # Activation transfer: 8 GPUs, NVLink, ~16 MB per layer + act_bytes = 16e6 # 16 MB + act_transfer_time = act_bytes / (NVLINK_H100_BW.m_as(GB / second) * 1e9) + comp_time_layer = 0.001 # 1 ms per layer + rho_tp = act_transfer_time / comp_time_layer -# ── INVARIANTS ────────────────────────────────────────────────────────────── -check(rho_7b > 0.1, "7B comm ratio must be non-trivial") -check(rho_350m > 0.01, "350M comm ratio must be non-trivial") + # ── INVARIANTS ────────────────────────────────────────────────────────── + check(rho_7b > 0.1, "7B comm ratio must be non-trivial") + check(rho_350m > 0.01, "350M comm ratio must be non-trivial") -# ── OUTPUTS ───────────────────────────────────────────────────────────────── -ar_7b_ms_str = fmt(allreduce_time_7b * 1000, precision=1, commas=False) -rho_7b_str = fmt(rho_7b, precision=2, commas=False) + # ── OUTPUTS ───────────────────────────────────────────────────────────── + ar_7b_ms_str = fmt(allreduce_time_7b.m_as(ureg.millisecond), 
precision=1, commas=False) + rho_7b_str = fmt(rho_7b, precision=2, commas=False) + ar_350m_ms_str = fmt(allreduce_time_350m.m_as(ureg.millisecond), precision=1, commas=False) + rho_350m_str = fmt(rho_350m, precision=1, commas=False) + rho_tp_str = fmt(rho_tp, precision=3, commas=False) -ar_350m_ms_str = fmt(allreduce_time_350m * 1000, precision=1, commas=False) -rho_350m_str = fmt(rho_350m, precision=1, commas=False) - -rho_tp_str = fmt(rho_tp, precision=3, commas=False) +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +ar_7b_ms_str = FleetCommCompRatio.ar_7b_ms_str +rho_7b_str = FleetCommCompRatio.rho_7b_str +rho_7b = FleetCommCompRatio.rho_7b # raw float used in fmt() call in prose +ar_350m_ms_str = FleetCommCompRatio.ar_350m_ms_str +rho_350m_str = FleetCommCompRatio.rho_350m_str +rho_tp_str = FleetCommCompRatio.rho_tp_str ``` @tbl-fleet-comm-comp shows the ratio for three representative scenarios. The contrast between them reveals why parallelism strategy must match the workload. @@ -685,12 +705,8 @@ The key insight for fleet-scale ML is that weak scaling is not just a mathematic ```{python} #| label: fleet-effective-flops #| echo: false - -# ============================================================================= -# PURPOSE -# ============================================================================= -# Purpose: Compute effective FLOPS for the compound loss example. -# Used in: Effective FLOPS worked example. +# Goal: Format peak and effective FLOPS for the 1,024-GPU compound loss callout. 
+# Exports: peak_str, eff_str, eff_pct_str, goodput_pct_str, mfu_pct_str, scaling_pct_str peak_str = fmt(FF.peak_1024, precision=0) eff_str = fmt(FF.eff_flops_1024, precision=0) diff --git a/book/quarto/contents/vol2/backmatter/appendix_reliability.qmd b/book/quarto/contents/vol2/backmatter/appendix_reliability.qmd index 2b3c159df..970d55e24 100644 --- a/book/quarto/contents/vol2/backmatter/appendix_reliability.qmd +++ b/book/quarto/contents/vol2/backmatter/appendix_reliability.qmd @@ -35,6 +35,28 @@ This appendix is designed as a *reference*. Use it when you need to move from in ```{python} #| label: appendix-reliability-setup #| echo: false +# ┌───────────────────────────────────────────────────────────────────────────── +# │ RELIABILITY FOUNDATIONS — MASTER COMPUTATION +# ├───────────────────────────────────────────────────────────────────────────── +# │ Context: PERSISTENT — All values used throughout the Reliability Foundations +# │ appendix: @tbl-component-fit, @tbl-mtbf-cluster, @tbl-failure-prob, +# │ @tbl-checkpoint-size, @tbl-recovery-anatomy, @tbl-strategy-comparison, +# │ @tbl-availability-stacking, and all Young-Daly worked examples. +# │ +# │ Goal: Provide all reliability constants — FIT rates, MTBF cascade, Young-Daly +# │ optimal checkpoint interval, recovery anatomy, and availability stacking — +# │ for the "Failure as a Physical Constraint" reference appendix. +# │ Show: See individual section cells for formatted values. This cell provides +# │ the physics; formatting cells and f-strings convert to display strings. +# │ How: pint Quantities from mlsys.constants; calc_mtbf_node, calc_mtbf_cluster, +# │ calc_young_daly_interval, calc_failure_probability, calc_checkpoint_size, +# │ calc_availability_stacked from formulas.py; all extractions via .m_as(). 
+# │ +# │ Imports: mlsys.constants (*), mlsys.formulas (calc_mtbf_*, calc_young_daly_interval, +# │ calc_failure_probability, calc_checkpoint_size, calc_availability_stacked) +# │ mlsys.formatting (fmt, check) +# │ Exports: R = ReliabilityFoundations (accessed as R.attribute in downstream cells) +# └───────────────────────────────────────────────────────────────────────────── from mlsys.constants import * from mlsys.formatting import fmt, check @@ -103,8 +125,9 @@ class ReliabilityFoundations: @classmethod def p_failure(cls, n_gpus, duration_hours): - mtbf_h = cls.cluster_mtbf(n_gpus) - return calc_failure_probability(mtbf_h, duration_hours) + mtbf_h = cls.cluster_mtbf(n_gpus) # Quantity[hour] + dur_h = duration_hours * ureg.hour # attach unit + return calc_failure_probability(mtbf_h, dur_h) # ┌── 5. CHECKPOINT SIZING ──────────────────────────────────────── # Mixed-precision Adam: 16 bytes/param @@ -114,25 +137,28 @@ class ReliabilityFoundations: @classmethod def ckpt_size_gb(cls, n_params): - return calc_checkpoint_size(n_params, cls.bytes_per_param) / 1e9 + return calc_checkpoint_size(n_params, cls.bytes_per_param).m_as(GB) # ┌── 6. 
YOUNG-DALY (10K cluster, 175B model) ──────────────────── - ckpt_175b_bytes = calc_checkpoint_size(175e9, 16) - ckpt_175b_gb = ckpt_175b_bytes / 1e9 - ckpt_write_bw = CHECKPOINT_WRITE_BW_GBS # GB/s - ckpt_write_time_s = ckpt_175b_gb / ckpt_write_bw + ckpt_175b_bytes = calc_checkpoint_size(175e9, 16) # Quantity[byte] + ckpt_175b_gb = ckpt_175b_bytes.m_as(GB) # raw float in GB + ckpt_write_bw = CHECKPOINT_WRITE_BW_GBS # GB/s (raw float) + ckpt_write_time_s = ckpt_175b_gb / ckpt_write_bw # raw float (seconds) - cluster_mtbf_10k_s = cluster_mtbf_10k * SEC_PER_HOUR - tau_opt_s = calc_young_daly_interval(ckpt_write_time_s, cluster_mtbf_10k_s) - tau_opt_min = tau_opt_s / SECONDS_PER_MINUTE + cluster_mtbf_10k_s = cluster_mtbf_10k.m_as(ureg.second) # raw float (seconds) + tau_opt_s = calc_young_daly_interval(ckpt_write_time_s, cluster_mtbf_10k_s) # Quantity[second] + tau_opt_min = tau_opt_s.m_as(ureg.minute) # raw float in minutes # ┌── 7. RECOVERY TIME ─────────────────────────────────────────── - t_detect = HEARTBEAT_TIMEOUT_S - t_reschedule = RESCHEDULE_TIME_S - t_reload_s = ckpt_write_time_s # same BW, same size + t_detect = HEARTBEAT_TIMEOUT_S # raw float (seconds) — kept for table display + t_reschedule = RESCHEDULE_TIME_S # raw float (seconds) — kept for table display + t_reload_s = ckpt_write_time_s # raw float (seconds) # Replay: half the interval on average - t_replay_s = tau_opt_s / 2 - t_recovery_total_s = t_detect + t_reschedule + t_reload_s + t_replay_s + t_replay_s = tau_opt_s / 2 # Quantity[second] + # Sum: attach units to raw seconds, then extract in minutes + t_recovery_total_s = ( + (t_detect + t_reschedule + t_reload_s) * ureg.second + t_replay_s + ).m_as(ureg.minute) # raw float in minutes # ┌── 8. 
GOODPUT ───────────────────────────────────────────────── overhead_ckpt = OVERHEAD_CHECKPOINT @@ -150,8 +176,8 @@ class ReliabilityFoundations: R = ReliabilityFoundations # short alias for inline use # ┌── INVARIANTS ────────────────────────────────────────────────────── -check(R.cluster_mtbf_10k < 5.0, - f"10K cluster MTBF should be < 5 hours, got {R.cluster_mtbf_10k:.2f}") +check(R.cluster_mtbf_10k.m_as(ureg.hour) < 5.0, + f"10K cluster MTBF should be < 5 hours, got {R.cluster_mtbf_10k.m_as(ureg.hour):.2f}") check(R.tau_opt_min > 5 and R.tau_opt_min < 60, f"Young-Daly interval should be 5-60 min, got {R.tau_opt_min:.1f}") check(R.p_failure(10_000, 24) > 0.99, @@ -159,12 +185,12 @@ check(R.p_failure(10_000, 24) > 0.99, # ┌── FORMATTED OUTPUTS ────────────────────────────────────────────── gpu_mttf_str = fmt(R.gpu_mttf, precision=0) -node_mtbf_str = fmt(R.node_mtbf, precision=0) -cluster_mtbf_10k_str = fmt(R.cluster_mtbf_10k, precision=2) +node_mtbf_str = fmt(R.node_mtbf.m_as(ureg.hour), precision=0) +cluster_mtbf_10k_str = fmt(R.cluster_mtbf_10k.m_as(ureg.hour), precision=2) tau_opt_min_str = fmt(R.tau_opt_min, precision=1) ckpt_175b_gb_str = fmt(R.ckpt_175b_gb, precision=0) ckpt_write_time_str = fmt(R.ckpt_write_time_s, precision=1) -t_recovery_str = fmt(R.t_recovery_total_s / SECONDS_PER_MINUTE, precision=1) +t_recovery_str = fmt(R.t_recovery_total_s, precision=1) ``` ## Failure Probability at Scale {#sec-reliability-foundations-failure-probability} @@ -188,8 +214,8 @@ $$ \text{MTTF} = \frac{10^9}{\text{FIT}} $$ {#eq-mttf-from-fit} ```{python} #| label: component-fit-table #| echo: false - -# Format component data for the table +# Goal: Format per-component MTTF in years for @tbl-component-fit. 
+# Exports: gpu_mttf_yr, hbm_mttf_yr, nic_mttf_yr, psu_mttf_yr, pcie_mttf_yr, cable_mttf_yr, tor_mttf_yr gpu_mttf_yr = f"{R.gpu_mttf / HOURS_PER_YEAR:.1f}" hbm_mttf_yr = f"{R.hbm_mttf / HOURS_PER_YEAR:.1f}" nic_mttf_yr = f"{R.nic_mttf / HOURS_PER_YEAR:.1f}" @@ -233,24 +259,24 @@ For a cluster of $N$ identical nodes, the same logic applies one level up: $$ \text{MTBF}_\text{cluster} = \frac{\text{MTBF}_\text{node}}{N} $$ {#eq-mtbf-cluster} -This is the **MTBF cascade**: reliability degrades linearly with component count at each level, and the levels compound. A node with `{python} f"{R.node_mtbf:,.0f}"`-hour MTBF sounds reliable. A cluster of `{python} f"{R.nodes_10k:,}"` such nodes has an MTBF of just `{python} cluster_mtbf_10k_str` hours---a failure every few hours is the expected steady state. +This is the **MTBF cascade**: reliability degrades linearly with component count at each level, and the levels compound. A node with `{python} f"{R.node_mtbf.m_as(ureg.hour):,.0f}"`-hour MTBF sounds reliable. A cluster of `{python} f"{R.nodes_10k:,}"` such nodes has an MTBF of just `{python} cluster_mtbf_10k_str` hours---a failure every few hours is the expected steady state. @tbl-mtbf-cluster shows how cluster MTBF shrinks as fleet size grows. ```{python} #| label: mtbf-cluster-table #| echo: false - -# Build MTBF table data +# Goal: Build MTBF row data (hours or minutes, failures/day) for @tbl-mtbf-cluster. 
+# Exports: mtbf_data list of dicts with "gpus", "nodes", "mtbf", "per_day" keys mtbf_data = [] for n_gpus in R.cluster_sizes: n_nodes = R.nodes_for_gpus(n_gpus) - mtbf_h = R.cluster_mtbf(n_gpus) - if mtbf_h >= 1.0: - mtbf_str = f"{mtbf_h:.1f} hours" + mtbf_h_val = R.cluster_mtbf(n_gpus).m_as(ureg.hour) # raw float in hours + if mtbf_h_val >= 1.0: + mtbf_str = f"{mtbf_h_val:.1f} hours" else: - mtbf_str = f"{mtbf_h * SECONDS_PER_MINUTE:.0f} minutes" - per_day = 24 / mtbf_h + mtbf_str = f"{mtbf_h_val * 60:.0f} minutes" + per_day = 24 / mtbf_h_val mtbf_data.append({ "gpus": f"{n_gpus:,}", "nodes": f"{n_nodes:,}", @@ -292,8 +318,8 @@ When $T_\text{job} \gg \text{MTBF}$, this probability approaches 1 rapidly. @tbl ```{python} #| label: failure-probability-table #| echo: false - -# Build failure probability matrix +# Goal: Compute P(≥1 failure) matrix for @tbl-failure-prob across cluster sizes and job durations. +# Exports: fp_data dict keyed by n_gpus; values are [1-day, 1-week, 30-day] probability strings dur_labels = ["1 Day", "1 Week", "30 Days"] fp_data = {} for n_gpus in R.cluster_sizes: @@ -370,6 +396,8 @@ $$ \text{Checkpoint Size} = N_\text{params} \times 16 \text{ bytes/param} $$ {#e ```{python} #| label: checkpoint-sizing-table #| echo: false +# Goal: Format checkpoint sizes and write times for @tbl-checkpoint-size across 7B–1T models. +# Exports: ckpt_data list of dicts with "label", "ckpt_gb", "write_time" keys ckpt_data = [] for i, n_params in enumerate(R.model_sizes_params): @@ -407,28 +435,50 @@ At frontier scale (175B+ parameters), checkpoint sizes reach the terabyte range. 
```{python} #| label: worked-example-young-daly #| echo: false +# ┌───────────────────────────────────────────────────────────────────────────── +# │ YOUNG-DALY WORKED EXAMPLE +# ├───────────────────────────────────────────────────────────────────────────── +# │ Context: @sec-reliability-foundations-worked-example callout +# │ +# │ Goal: Compute optimal checkpoint interval τ_opt for 175B model on 10K-GPU cluster; +# │ show scaling to 20K GPUs. +# │ Show: ~28 min optimal interval, ~X% checkpoint overhead, shorter interval at 20K GPUs. +# │ How: calc_young_daly_interval(δ, MTBF_s) from R.ckpt_write_time_s and R.cluster_mtbf_10k_s. +# │ +# │ Imports: mlsys.formulas (calc_young_daly_interval), mlsys.constants (GPUS_PER_HOST) +# │ Exports: yd_mtbf_h_str, yd_delta_str, yd_tau_min_str, yd_overhead_str, tau_20k_min_str +# └───────────────────────────────────────────────────────────────────────────── -# All values already computed in ReliabilityFoundations -yd_mtbf_h = R.cluster_mtbf_10k -yd_mtbf_s = R.cluster_mtbf_10k_s -yd_delta = R.ckpt_write_time_s -yd_tau_s = R.tau_opt_s -yd_tau_min = R.tau_opt_min +class WorkedExampleYoungDaly: + """Young-Daly optimal checkpoint interval for 175B model on 10K-GPU cluster.""" + # All values already computed in ReliabilityFoundations + yd_mtbf_h = R.cluster_mtbf_10k # Quantity[hour] + yd_mtbf_s = R.cluster_mtbf_10k_s # raw float (seconds) + yd_delta = R.ckpt_write_time_s # raw float (seconds) + yd_tau_s = R.tau_opt_s # Quantity[second] + yd_tau_min = R.tau_opt_min # raw float in minutes -# Overhead from checkpointing alone -yd_ckpt_overhead = (yd_delta / yd_tau_s) * 100 + # Overhead from checkpointing alone + yd_ckpt_overhead = (yd_delta / yd_tau_s.m_as(ureg.second)) * 100 -# What if MTBF halves (20K GPUs)? 
-mtbf_20k_h = R.node_mtbf / (20_000 // GPUS_PER_HOST) -mtbf_20k_s = mtbf_20k_h * SEC_PER_HOUR -tau_20k_s = calc_young_daly_interval(yd_delta, mtbf_20k_s) -tau_20k_min = tau_20k_s / SECONDS_PER_MINUTE + # What if MTBF halves (20K GPUs)? + mtbf_20k_h = R.node_mtbf / (20_000 // GPUS_PER_HOST) # Quantity[hour] + mtbf_20k_s = mtbf_20k_h.m_as(ureg.second) # raw float (seconds) + tau_20k_s = calc_young_daly_interval(yd_delta, mtbf_20k_s) # Quantity[second] + tau_20k_min = tau_20k_s.m_as(ureg.minute) # raw float in minutes -yd_mtbf_h_str = fmt(yd_mtbf_h, precision=2) -yd_delta_str = fmt(yd_delta, precision=1) -yd_tau_min_str = fmt(yd_tau_min, precision=1) -yd_overhead_str = fmt(yd_ckpt_overhead, precision=1) -tau_20k_min_str = fmt(tau_20k_min, precision=1) + yd_mtbf_h_str = fmt(yd_mtbf_h.m_as(ureg.hour), precision=2) + yd_delta_str = fmt(yd_delta, precision=1) + yd_tau_min_str = fmt(yd_tau_min, precision=1) + yd_overhead_str = fmt(yd_ckpt_overhead, precision=1) + tau_20k_min_str = fmt(tau_20k_min, precision=1) + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +yd_mtbf_h_str = WorkedExampleYoungDaly.yd_mtbf_h_str +yd_delta_str = WorkedExampleYoungDaly.yd_delta_str +yd_tau_min_str = WorkedExampleYoungDaly.yd_tau_min_str +yd_overhead_str = WorkedExampleYoungDaly.yd_overhead_str +tau_20k_min_str = WorkedExampleYoungDaly.tau_20k_min_str ``` ::: {.callout-example title="Young-Daly: 175B Model on a 10,000-GPU Cluster"} @@ -470,12 +520,14 @@ $$ T_\text{recovery} = T_\text{detect} + T_\text{reschedule} + T_\text{reload} + ```{python} #| label: recovery-anatomy-table #| echo: false +# Goal: Format recovery phase durations for @tbl-recovery-anatomy. 
+# Exports: t_detect_str, t_reschedule_str, t_reload_str, t_replay_str, t_total_str t_detect_str = f"{R.t_detect}" t_reschedule_str = f"{R.t_reschedule}" t_reload_str = fmt(R.t_reload_s, precision=1) -t_replay_str = fmt(R.t_replay_s / SECONDS_PER_MINUTE, precision=1) -t_total_str = fmt(R.t_recovery_total_s / SECONDS_PER_MINUTE, precision=1) +t_replay_str = fmt(R.t_replay_s.m_as(ureg.minute), precision=1) +t_total_str = fmt(R.t_recovery_total_s, precision=1) ``` +----------------------------+---------------------------+-------------------------------------------------+ @@ -567,6 +619,8 @@ where $A$ is the availability of a single replica and $k$ is the number of repli ```{python} #| label: availability-stacking-table #| echo: false +# Goal: Format availability, nines count, and annual downtime for @tbl-availability-stacking. +# Exports: avail_data list of dicts with "k", "avail", "nines", "downtime" keys avail_data = [] for k in R.avail_replicas: diff --git a/book/quarto/contents/vol2/data_storage/data_storage.qmd b/book/quarto/contents/vol2/data_storage/data_storage.qmd index da42138f9..15986bc4a 100644 --- a/book/quarto/contents/vol2/data_storage/data_storage.qmd +++ b/book/quarto/contents/vol2/data_storage/data_storage.qmd @@ -27,7 +27,8 @@ from mlsys.constants import ( CLOUD_EGRESS_PER_GB, USD, STORAGE_COST_S3_STD, STORAGE_COST_GLACIER, STORAGE_COST_NVME_LOW, STORAGE_COST_NVME_HIGH, - Mparam, Bparam, TFLOPs, GFLOPs + Mparam, Bparam, TFLOPs, GFLOPs, + watt ) from mlsys.formatting import fmt, sci, check @@ -77,13 +78,25 @@ Accelerators can compute faster than storage can feed them. A modern GPU process # ┌───────────────────────────────────────────────────────────────────────────── # │ STORAGE HIERARCHY AND MODEL SPECIFICATIONS # ├───────────────────────────────────────────────────────────────────────────── -# │ Context: Used across the chapter for hierarchy tables and bottleneck analysis. 
+# │ Context: @sec-data-storage storage hierarchy tables and I/O bottleneck +# │ analysis paragraphs throughout the chapter. # │ -# │ Goal: Provide quantitative specs for hardware and lighthouse models. -# │ Show: The massive gap between HBM bandwidth and disk I/O. +# │ Goal: Establish the six-tier storage hierarchy gap by computing H100 HBM +# │ bandwidth (H100_MEM_BW) vs NVMe sequential bandwidth (NVME_SEQUENTIAL_BW), +# │ and estimate GPT-3 checkpoint write time (GPT3_PARAMS, FP16, at NVMe +# │ vs network storage) to show the I/O bottleneck in fault tolerance. +# │ Show: "3.35" TB/s H100 HBM vs "~7" GB/s NVMe — inline in the storage +# │ hierarchy tier comparison and checkpoint I/O bottleneck paragraphs. +# │ How: Direct .m_as() for each unit conversion; H100_TDP .m_as(watt). # │ -# │ Imports: mlsys.constants -# │ Exports: a100_mem, h100_bw_tbs, gpt3_params_b, resnet_params_m, etc. +# │ Imports: mlsys.constants (A100_MEM_CAPACITY, H100_MEM_CAPACITY, H100_MEM_BW, +# │ H100_FLOPS_FP8_TENSOR, H100_FLOPS_FP16_TENSOR, H100_TDP, +# │ GPT3_PARAMS, RESNET50_PARAMS, NVME_SEQUENTIAL_BW, +# │ NVLINK_H100_BW, PCIE_GEN5_BW, GiB, TB, TFLOPs, GB, second, +# │ watt, Bparam, Mparam) +# │ Exports: a100_mem, h100_mem, h100_bw_tbs, h100_fp8_tflops, h100_fp16_tflops, +# │ h100_tdp_w, gpt3_params_b, resnet_params_m, nvme_bw, +# │ nvlink_bw_gbs, pcie5_bw_gbs # └───────────────────────────────────────────────────────────────────────────── import math @@ -93,21 +106,21 @@ class StorageSetup: Namespace for global storage constants and specs. 
""" # GPU specs - a100_mem = A100_MEM_CAPACITY.to(GiB).magnitude - h100_mem = H100_MEM_CAPACITY.to(GiB).magnitude - h100_bw = H100_MEM_BW.to(TB/second).magnitude - h100_fp8 = H100_FLOPS_FP8_TENSOR.to(TFLOPs/second).magnitude - h100_fp16 = H100_FLOPS_FP16_TENSOR.to(TFLOPs/second).magnitude - h100_tdp = H100_TDP.magnitude + a100_mem = A100_MEM_CAPACITY.m_as(GiB) + h100_mem = H100_MEM_CAPACITY.m_as(GiB) + h100_bw = H100_MEM_BW.m_as(TB/second) + h100_fp8 = H100_FLOPS_FP8_TENSOR.m_as(TFLOPs/second) + h100_fp16 = H100_FLOPS_FP16_TENSOR.m_as(TFLOPs/second) + h100_tdp = H100_TDP.m_as(watt) # Model specs - gpt3_params = GPT3_PARAMS.to(Bparam).magnitude - resnet_params = RESNET50_PARAMS.to(Mparam).magnitude + gpt3_params = GPT3_PARAMS.m_as(Bparam) + resnet_params = RESNET50_PARAMS.m_as(Mparam) # Storage & Interconnect - nvme_bw = NVME_SEQUENTIAL_BW.to(GB/second).magnitude - nvlink_bw = NVLINK_H100_BW.to(GB/second).magnitude - pcie5_bw = PCIE_GEN5_BW.to(GB/second).magnitude + nvme_bw = NVME_SEQUENTIAL_BW.m_as(GB/second) + nvlink_bw = NVLINK_H100_BW.m_as(GB/second) + pcie5_bw = PCIE_GEN5_BW.m_as(GB/second) # ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── a100_mem = f"{StorageSetup.a100_mem:.0f}" @@ -125,11 +138,11 @@ nvlink_bw_gbs = f"{StorageSetup.nvlink_bw:.0f}" pcie5_bw_gbs = f"{StorageSetup.pcie5_bw:.0f}" # Storage -nvme_bw = f"{NVME_SEQUENTIAL_BW.to(GB/second).magnitude:.1f}" +nvme_bw = f"{NVME_SEQUENTIAL_BW.m_as(GB/second):.1f}" # Interconnect -nvlink_bw_gbs = f"{NVLINK_H100_BW.to(GB/second).magnitude:.0f}" -pcie5_bw_gbs = f"{PCIE_GEN5_BW.to(GB/second).magnitude:.0f}" +nvlink_bw_gbs = f"{NVLINK_H100_BW.m_as(GB/second):.0f}" +pcie5_bw_gbs = f"{PCIE_GEN5_BW.m_as(GB/second):.0f}" # ┌── P.I.C.O. 
ISOLATED SCENARIO ─────────────────────────────────────────────── class StorageEconomics: diff --git a/book/quarto/contents/vol2/fault_tolerance/fault_tolerance.qmd b/book/quarto/contents/vol2/fault_tolerance/fault_tolerance.qmd index 6c9049e2b..98a48987f 100644 --- a/book/quarto/contents/vol2/fault_tolerance/fault_tolerance.qmd +++ b/book/quarto/contents/vol2/fault_tolerance/fault_tolerance.qmd @@ -40,25 +40,66 @@ A single GPU fails perhaps once per year. A thousand GPUs experience failures da ::: ```{python} -#| label: fault-tolerance-setup #| echo: false +#| label: fault-tolerance-setup +# ┌───────────────────────────────────────────────────────────────────────────── +# │ FAULT TOLERANCE CHAPTER SETUP +# ├───────────────────────────────────────────────────────────────────────────── +# │ Context: Chapter-wide registry — values used in §Young-Daly Law +# │ (@eq-young-daly-applied, line ~1957), §Sharded Checkpointing (line ~2289), +# │ and §Recovery Cost (line ~2365). +# │ +# │ Goal: Pre-compute GPT-3 checkpoint size (weights + Adam states) and +# │ per-worker shard size for 1000-worker training, motivating the +# │ checkpoint-interval formula and distributed checkpoint design. +# │ Show: gpt3_ckpt_tb="2.1" TB (full checkpoint), +# │ gpt3_shard_gb="2.1" GB (per-worker shard at 1000 workers) — inline in prose. +# │ How: Multiply GPT3_PARAMS.m_as(param) by bytes-per-param for each state; +# │ convert result pint Quantity with .m_as(TB) and .m_as(GB). +# │ +# │ Imports: mlsys.constants (GPT3_PARAMS, param, byte, TB, GB, BILLION), +# │ mlsys.formatting (fmt, sci) +# │ Exports: gpt3_params_b, gpt3_ckpt_tb, gpt3_adam_tb, gpt3_shard_gb +# │ Note: PERSISTENT — gpt3_ckpt_tb used in §Young-Daly (line ~1957), +# │ §Sharded Checkpointing (line ~2289), §Recovery (line ~2365, ~2385); +# │ gpt3_shard_gb used in §Sharded Checkpointing (line ~2289), §Recovery (~2371, ~2385). 
+# └───────────────────────────────────────────────────────────────────────────── from mlsys.constants import * from mlsys.formatting import fmt, sci -# GPT-3 model parameters -gpt3_params_b = f"{GPT3_PARAMS.to(param).magnitude / BILLION:.0f}" +# ┌── P.I.C.O. ISOLATED SCENARIO ─────────────────────────────────────────────── +class FaultToleranceSetup: + """Namespace for GPT-3 checkpoint sizing and shard calculations.""" -# GPT-3 checkpoint size: weights (2 bytes FP16) + Adam m (4 bytes) + Adam v (4 bytes) = 12 bytes/param -gpt3_ckpt_bytes = GPT3_PARAMS.magnitude * 12 * byte -gpt3_ckpt_tb = f"{gpt3_ckpt_bytes.to(TB).magnitude:.1f}" + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + # GPT-3 checkpoint byte layout: + # weights (2 bytes FP16) + Adam m (4 bytes) + Adam v (4 bytes) = 12 bytes/param + bytes_full_ckpt = 12 # bytes per param: weights + Adam m + v + bytes_adam_only = 8 # bytes per param: Adam m + v only + n_workers = 1000 # workers for shard size calculation -# GPT-3 Adam optimizer state: m + v = 8 bytes/param -gpt3_adam_bytes = GPT3_PARAMS.magnitude * 8 * byte -gpt3_adam_tb = f"{gpt3_adam_bytes.to(TB).magnitude:.1f}" + # ┌── 2. CALCULATION (The Physics) ──────────────────────────────────────── + # Full checkpoint: weights + optimizer states + gpt3_ckpt_bytes = GPT3_PARAMS.m_as(param) * bytes_full_ckpt * byte -# Per-worker shard for 1000 workers -gpt3_shard_gb = f"{gpt3_ckpt_bytes.to(GB).magnitude / 1000:.1f}" + # Optimizer-only checkpoint: Adam m + v (no weights) + gpt3_adam_bytes = GPT3_PARAMS.m_as(param) * bytes_adam_only * byte + + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + # No check() calls needed — values are monotone functions of constants. + + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + gpt3_params_b = f"{GPT3_PARAMS.m_as(param) / BILLION:.0f}" + gpt3_ckpt_tb = f"{gpt3_ckpt_bytes.m_as(TB):.1f}" + gpt3_adam_tb = f"{gpt3_adam_bytes.m_as(TB):.1f}" + gpt3_shard_gb = f"{gpt3_ckpt_bytes.m_as(GB) / n_workers:.1f}" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +gpt3_params_b = FaultToleranceSetup.gpt3_params_b +gpt3_ckpt_tb = FaultToleranceSetup.gpt3_ckpt_tb +gpt3_adam_tb = FaultToleranceSetup.gpt3_adam_tb +gpt3_shard_gb = FaultToleranceSetup.gpt3_shard_gb ``` ## Failure Analysis at Scale {#sec-fault-tolerance-reliability-reliability-failure-analysis-scale-6b4b} @@ -2123,45 +2164,88 @@ Imagine 10,000 GPUs, each holding a 10 GB shard of the model state, simultaneous While @tbl-checkpoint-overhead-by-model suggests modest overhead percentages, real deployments often encounter checkpoint times far exceeding these theoretical estimates. Diagnosing such discrepancies requires examining the full system stack. ```{python} -#| label: checkpoint-debug-calc #| echo: false +#| label: checkpoint-debug-calc +# ┌───────────────────────────────────────────────────────────────────────────── +# │ CHECKPOINT DEBUG CALCULATION +# ├───────────────────────────────────────────────────────────────────────────── +# │ Context: "Debugging Checkpoint Overhead" callout in §Checkpoint Overhead. +# │ +# │ Goal: Diagnose why a 70B model checkpoint takes 10 minutes instead of +# │ 2 minutes on an NFS-backed cluster, by computing theoretical bandwidth +# │ limits and contention-induced effective throughput per node. +# │ Show: total_ckpt_gb_str="420" GB, nfs_gbs_str="1.25" GB/s, +# │ min_write_min_str="5.6" min, per_node_mbs_str="20" MB/s, +# │ serialized_min_str="5,600" min — inline in the Fleet Stack diagnosis. 
+# │ How: Compute weights + optimizer state size in GB; derive NFS bandwidth in +# │ GB/s (10 Gbps / 8); calculate min write time and per-node bandwidth +# │ under contention from 64 concurrent nodes. +# │ +# │ Imports: (none — pure Python arithmetic, no pint quantities) +# │ Exports: weights_gb_str, optimizer_gb_str, total_ckpt_gb_str, nfs_gbs_str, +# │ min_write_s_str, min_write_min_str, per_node_mbs_str, serialized_min_str, +# │ extended_weeks_str, extra_cost_k_str +# └───────────────────────────────────────────────────────────────────────────── -# 70B model checkpoint sizing -model_params_b = 70 # billions -bytes_per_param = 2 # BF16 -weights_gb = model_params_b * bytes_per_param # 140 GB -optimizer_gb = weights_gb * 2 # Adam first + second moments -total_ckpt_gb = weights_gb + optimizer_gb # 420 GB +class CheckpointDebugCalc: + """Diagnose 70B checkpoint overhead on NFS-backed cluster.""" -# Storage constraints -nfs_gbps = 10 # Gbps network -nfs_gbs = nfs_gbps / 8 # 1.25 GB/s -min_write_s = total_ckpt_gb / nfs_gbs # seconds -min_write_min = min_write_s / 60 # minutes + # ┌── 1. PARAMETERS (Inputs) ────────────────────────────────────────────── + model_params_b = 70 # 70B parameter model + bytes_per_param = 2 # BF16 weights + nfs_gbps = 10 # NFS network attachment bandwidth in Gbps + n_nodes = 64 # nodes writing simultaneously + overhead_pct = 30 # observed training throughput loss % + base_weeks = 2 # baseline training duration (weeks) + extra_cost_k = 500 # additional cost from extended training ($K) -# Contention analysis -n_nodes = 64 -per_node_gbs = nfs_gbs / n_nodes # GB/s per node -per_node_mbs = per_node_gbs * 1000 # MB/s per node -serialized_min = (total_ckpt_gb / per_node_gbs) / 60 + # ┌── 2. 
CALCULATION (The Physics) ──────────────────────────────────────── + # Model state sizing + weights_gb = model_params_b * bytes_per_param # 140 GB + optimizer_gb = weights_gb * 2 # Adam m + v moments + total_ckpt_gb = weights_gb + optimizer_gb # 420 GB -# Training extension -overhead_pct = 30 -base_weeks = 2 -extended_weeks = base_weeks * (1 + overhead_pct / 100) -extra_cost_k = 500 # $K + # Storage bandwidth limits + nfs_gbs = nfs_gbps / 8 # 1.25 GB/s + min_write_s = total_ckpt_gb / nfs_gbs # theoretical minimum seconds + min_write_min = min_write_s / 60 # convert to minutes -# Format strings -weights_gb_str = f"{weights_gb:.0f}" -optimizer_gb_str = f"{optimizer_gb:.0f}" -total_ckpt_gb_str = f"{total_ckpt_gb:.0f}" -nfs_gbs_str = f"{nfs_gbs}" -min_write_s_str = f"{min_write_s:.0f}" -min_write_min_str = f"{min_write_min:.1f}" -per_node_mbs_str = f"{per_node_mbs:.0f}" -serialized_min_str = f"{serialized_min:.0f}" -extended_weeks_str = f"{extended_weeks:.1f}" -extra_cost_k_str = f"{extra_cost_k}" + # Contention: 64 nodes sharing the NFS bandwidth + per_node_gbs = nfs_gbs / n_nodes # GB/s per node under contention + per_node_mbs = per_node_gbs * 1000 # MB/s per node + serialized_min = (total_ckpt_gb / per_node_gbs) / 60 # worst-case serialized write time + + # Training schedule impact + extended_weeks = base_weeks * (1 + overhead_pct / 100) + + # ┌── 3. INVARIANTS (Guardrails) ────────────────────────────────────────── + assert min_write_min < 10, "Theoretical minimum must be less than observed 10 minutes" + assert serialized_min > min_write_min, "Contention time must exceed theoretical minimum" + + # ┌── 4. 
OUTPUTS (Formatting) ───────────────────────────────────────────── + weights_gb_str = f"{weights_gb:.0f}" + optimizer_gb_str = f"{optimizer_gb:.0f}" + total_ckpt_gb_str = f"{total_ckpt_gb:.0f}" + nfs_gbs_str = f"{nfs_gbs}" + min_write_s_str = f"{min_write_s:.0f}" + min_write_min_str = f"{min_write_min:.1f}" + per_node_mbs_str = f"{per_node_mbs:.0f}" + serialized_min_str = f"{serialized_min:.0f}" + extended_weeks_str = f"{extended_weeks:.1f}" + extra_cost_k_str = f"{extra_cost_k}" + +# ┌── EXPORTS (Bridge to Text) ───────────────────────────────────────────────── +weights_gb_str = CheckpointDebugCalc.weights_gb_str +optimizer_gb_str = CheckpointDebugCalc.optimizer_gb_str +total_ckpt_gb_str = CheckpointDebugCalc.total_ckpt_gb_str +nfs_gbs_str = CheckpointDebugCalc.nfs_gbs_str +min_write_s_str = CheckpointDebugCalc.min_write_s_str +min_write_min_str = CheckpointDebugCalc.min_write_min_str +per_node_gbs = CheckpointDebugCalc.per_node_gbs +per_node_mbs_str = CheckpointDebugCalc.per_node_mbs_str +serialized_min_str = CheckpointDebugCalc.serialized_min_str +extended_weeks_str = CheckpointDebugCalc.extended_weeks_str +extra_cost_k_str = CheckpointDebugCalc.extra_cost_k_str ``` ::: {.callout-example title="Debugging Checkpoint Overhead"}