mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-03-09 07:15:51 -05:00
refactor: anchor Volume 2 Compute Infrastructure math to the Frontier Mission; standardize hardware twin naming and unify Vol 1 and Vol 2 logic
This commit is contained in:
@@ -22,7 +22,7 @@ from mlsys.registry import start_chapter
|
||||
from mlsys.constants import (
|
||||
GB, TB, PB, Gbps, byte, second, MB, watt, hour, kilowatt,
|
||||
BILLION, TRILLION, SEC_PER_HOUR, SEC_PER_DAY, BITS_PER_BYTE, KIB_TO_BYTES,
|
||||
A100_MEM_CAPACITY, H100_MEM_CAPACITY, H100_MEM_BW, H100_TDP,
|
||||
A100_MEM_CAPACITY, system, H100_TDP,
|
||||
A100_FLOPS_FP16_TENSOR, A100_MEM_BW, A100_TDP,
|
||||
B200_FLOPS_FP16_TENSOR, B200_MEM_BW, B200_TDP,
|
||||
V100_FLOPS_FP16_TENSOR, V100_MEM_BW, V100_TDP,
|
||||
@@ -35,7 +35,7 @@ from mlsys.constants import (
|
||||
WSE2_CORES, WSE2_MEM_CAPACITY, WSE2_MEM_BW, WSE2_TDP,
|
||||
WSE3_CORES, WSE3_MEM_CAPACITY, WSE3_MEM_BW, WSE3_TDP,
|
||||
TPUV5P_MEM_BW, NVLINK_H100_BW, INFINIBAND_NDR_BW, PCIE_GEN5_BW, SYSTEM_MEMORY_BW,
|
||||
CLOUD_ELECTRICITY_PER_KWH, USD, GPT3_PARAMS,
|
||||
CLOUD_ELECTRICITY_PER_KWH, USD, Models,
|
||||
Mparam, Bparam, TFLOPs, flop, param, MFLOPs, GFLOPs
|
||||
)
|
||||
from mlsys.formatting import fmt, sci, check, md, md_math
|
||||
@@ -94,12 +94,12 @@ Machine learning systems have a physical reality that transcends code. While the
|
||||
# │ How: Pull from mlsys.constants; ridge via (flops/bw).to(flop/byte).m_as(flop/byte);
|
||||
# │ TCO from CLOUD_ELECTRICITY_PER_KWH + amortized CapEx vs cloud hourly rate.
|
||||
# │
|
||||
# │ Imports: mlsys.constants (H100_MEM_BW, H100_FLOPS_FP16_TENSOR,
|
||||
# │ H100_MEM_CAPACITY, H100_TDP, A100_FLOPS_FP16_TENSOR, A100_MEM_BW,
|
||||
# │ Imports: mlsys.constants (system.memory_bw, system.peak_flops,
|
||||
# │ system.ram, H100_TDP, A100_FLOPS_FP16_TENSOR, A100_MEM_BW,
|
||||
# │ A100_TDP, B200_FLOPS_FP16_TENSOR, B200_MEM_BW, B200_TDP,
|
||||
# │ V100_FLOPS_FP16_TENSOR, V100_MEM_BW, V100_TDP,
|
||||
# │ NVLINK_H100_BW, INFINIBAND_NDR_BW, PCIE_GEN5_BW, SYSTEM_MEMORY_BW,
|
||||
# │ CLOUD_ELECTRICITY_PER_KWH, GPT3_PARAMS, TPUV3_FLOPS_BF16,
|
||||
# │ CLOUD_ELECTRICITY_PER_KWH, Models.GPT3.parameters, TPUV3_FLOPS_BF16,
|
||||
# │ TPUV4_FLOPS_BF16, TPUV5P_FLOPS_BF16, WSE3_CORES, WSE3_MEM_CAPACITY)
|
||||
# │ Exports: h100_tflops, h100_bw, h100_ridge, ddr_bw_str, nvlink_bw_str,
|
||||
# │ pcie_bw_str, ib_bw_str, ib_bw_gbs, node_hbm_cap, rack_power_str,
|
||||
@@ -113,13 +113,13 @@ class TokenLatency:
|
||||
"""
|
||||
Scenario: The Physics of Token Latency (Memory Wall).
|
||||
Calculates time for loading weights vs performing compute.
|
||||
Matches Archetype A (GPT-4 / Llama-3) profile.
|
||||
Matches Archetype A (Frontier LLM) profile.
|
||||
"""
|
||||
# ┌── 1. LOAD (Constants) ───────────────────────────────────────────────
|
||||
p_params = LLAMA3_70B_PARAMS
|
||||
weight_bytes = BYTES_PER_FP16
|
||||
hbm_bw = H100_MEM_BW
|
||||
peak_tflops = H100_FLOPS_FP16_TENSOR
|
||||
p_params = Models.Llama2_70B.parameters
|
||||
weight_bytes = BYTES_FP16
|
||||
hbm_bw = system.memory_bw
|
||||
peak_tflops = system.peak_flops
|
||||
|
||||
# ┌── 2. EXECUTE (The Compute) ─────────────────────────────────────────
|
||||
weight_vol = p_params * weight_bytes
|
||||
@@ -177,18 +177,18 @@ In the Introduction to this volume (@sec-vol2-introduction), we defined the Mach
|
||||
# │
|
||||
# │ Goal: Provide GPT-3 scale statistics for infrastructure discussion.
|
||||
# │ Show: ~175B parameters.
|
||||
# │ How: pulling GPT3_PARAMS from mlsys.constants.
|
||||
# │ How: pulling Models.GPT3.parameters from mlsys.constants.
|
||||
# │
|
||||
# │ Imports: mlsys.constants (GPT3_PARAMS, param, BILLION)
|
||||
# │ Imports: mlsys.constants (Models.GPT3.parameters, param, BILLION)
|
||||
# │ Exports: gpt3_params_b
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
from mlsys.constants import GPT3_PARAMS, param, BILLION
|
||||
from mlsys.constants import Models, param, BILLION
|
||||
|
||||
class Gpt3ScaleScenario:
|
||||
class FrontierScaleScenario:
|
||||
"""GPT-3 scale reference for infrastructure."""
|
||||
|
||||
# ┌── 1. LOAD (Constants) ──────────────────────────────────────────────
|
||||
params = GPT3_PARAMS.m_as(param)
|
||||
params = Models.GPT3.parameters.m_as(param)
|
||||
|
||||
# ┌── 2. EXECUTE (The Compute) ────────────────────────────────────────
|
||||
params_b = params / BILLION
|
||||
@@ -197,10 +197,10 @@ class Gpt3ScaleScenario:
|
||||
gpt3_params_b = f"{params_b:.0f}"
|
||||
|
||||
# ┌── EXPORTS (Bridge to Text) ─────────────────────────────────────────────────
|
||||
gpt3_params_b = Gpt3ScaleScenario.gpt3_params_b
|
||||
frontier_params_b = FrontierScaleScenario.gpt3_params_b
|
||||
```
|
||||
|
||||
To understand the magnitude of this challenge, consider our running example: a `{python} gpt3_params_b`B-parameter language model. In standard half-precision (FP16), the model's weights alone occupy 350 GB of memory. A single NVIDIA H100 accelerator -- the current standard for high-performance training -- provides 80 GB of High Bandwidth Memory. The model is more than 4$\times$ larger than the hardware intended to run it. When we account for the optimizer states and gradients required for training, the memory footprint exceeds 2 TB. This single arithmetic reality drives every architectural decision in this chapter: because the model cannot fit on a single chip, we are forced to build a distributed system.
|
||||
To understand the magnitude of this challenge, consider our running example: a `{python} frontier_params_b`B-parameter language model. In standard half-precision (FP16), the model's weights alone occupy 350 GB of memory. A single NVIDIA H100 accelerator -- the current standard for high-performance training -- provides 80 GB of High Bandwidth Memory. The model is more than 4$\times$ larger than the hardware intended to run it. When we account for the optimizer states and gradients required for training, the memory footprint exceeds 2 TB. This single arithmetic reality drives every architectural decision in this chapter: because the model cannot fit on a single chip, we are forced to build a distributed system.
|
||||
|
||||
Building this system requires us to expand outward from the silicon die in a series of concentric physical layers, each designed to overcome a specific fundamental constraint:
|
||||
|
||||
@@ -215,7 +215,7 @@ In the Fleet Stack model introduced in @sec-vol2-introduction, **Compute Infrast
|
||||
|
||||
:::
|
||||
|
||||
Even for machine learning engineers who will never crimp an Ethernet cable or debug a liquid cooling loop, the physical reality of the datacenter defines the *constraints* of their software. The decision to shard our `{python} gpt3_params_b`B model is not merely algorithmic but a direct response to the bandwidth hierarchy: tensor parallelism exploits ultra-fast intra-node NVLink, while pipeline parallelism mitigates the latency of slower inter-node links. The optimal batch size is determined by the Roofline Model, ensuring arithmetic intensity prevents the accelerator from stalling on memory fetches. The necessity of quantization is a concession to the Memory Wall. At scale, the frequency of checkpointing is a function of the Mean Time Between Failures (MTBF) of a 1,000-GPU pod. Every software optimization in distributed training is ultimately an attempt to navigate these physical boundaries, transforming system configuration from ad hoc heuristics into principled engineering grounded in the physics of latency, bandwidth, and failure rates.
|
||||
Even for machine learning engineers who will never crimp an Ethernet cable or debug a liquid cooling loop, the physical reality of the datacenter defines the *constraints* of their software. The decision to shard our `{python} frontier_params_b`B model is not merely algorithmic but a direct response to the bandwidth hierarchy: tensor parallelism exploits ultra-fast intra-node NVLink, while pipeline parallelism mitigates the latency of slower inter-node links. The optimal batch size is determined by the Roofline Model, ensuring arithmetic intensity prevents the accelerator from stalling on memory fetches. The necessity of quantization is a concession to the Memory Wall. At scale, the frequency of checkpointing is a function of the Mean Time Between Failures (MTBF) of a 1,000-GPU pod. Every software optimization in distributed training is ultimately an attempt to navigate these physical boundaries, transforming system configuration from ad hoc heuristics into principled engineering grounded in the physics of latency, bandwidth, and failure rates.
|
||||
|
||||
We begin our exploration at the center of this hierarchy, where the struggle to bridge the gap between CPU capability and AI demand first began: the accelerator.
|
||||
|
||||
@@ -245,7 +245,7 @@ These inversions explain why CPUs are poorly suited for large-scale ML and why a
|
||||
|
||||
## The Accelerator {#sec-compute-accelerator-core}
|
||||
|
||||
Suppose we want to train a `{python} gpt3_params_b`B-parameter language model. The arithmetic is straightforward: each training step requires roughly `{python} flops_per_token_math` floating-point operations per token, and we need to process trillions of tokens. A modern server CPU can deliver perhaps 1--2 TFLOPS of matrix throughput. At that rate, training would take decades.
|
||||
Suppose we want to train a `{python} frontier_params_b`B-parameter language model. The arithmetic is straightforward: each training step requires roughly `{python} flops_per_token_math` floating-point operations per token, and we need to process trillions of tokens. A modern server CPU can deliver perhaps 1--2 TFLOPS of matrix throughput. At that rate, training would take decades.
|
||||
|
||||
To put this in perspective, training GPT-3 on a single modern CPU would require approximately 355 years. Even with an aggressive 64-core server running at 2 TFLOPS aggregate, the computation would take over 5 years. The gap between CPU capability and the compute requirements of frontier models is not a small inefficiency that clever software can bridge; it is a chasm of 3--4 orders of magnitude that can only be crossed by fundamentally different hardware.
|
||||
|
||||
@@ -253,7 +253,7 @@ This gap has widened over time, not narrowed. In 2012, the compute required to t
|
||||
|
||||
The question that launched an entire hardware industry is therefore not *whether* to accelerate, but *how much generality to sacrifice* in exchange for the throughput that makes large-scale training feasible. This question does not have a single answer; it depends on the workload's predictability, the organization's scale, and the time horizon of the investment.
|
||||
|
||||
In practice, the "build versus buy" calculus for hardware acceleration is dominated not by silicon efficiency but by **software ecosystem lock-in**. While a custom ASIC might offer a 10$\times$ improvement in performance per watt for specific tensor operations by discarding the generality tax of GPU display engines and rasterization logic, the switching costs are often prohibitive. Migrating the training stack for our `{python} gpt3_params_b`B model from NVIDIA's CUDA to a proprietary ASIC environment requires rewriting highly optimized kernels for attention mechanisms, porting communication primitives like NCCL, and debugging numerical instabilities that vanish on established platforms. The **Non-Recurring Engineering (NRE)** cost for a modern 5nm ASIC design ranges from \$50 million to over \$200 million, with a design cycle of 24--36 months. By the time a custom chip optimized for a specific Transformer architecture returns from the fab, the state of the art may have shifted to Mixture-of-Experts or state-space models, rendering the hardware assumptions obsolete. Consequently, only hyperscalers with stable, massive-scale workloads can amortize this risk. For most organizations, the GPU premium is effectively an insurance policy against software incompatibility and architectural obsolescence.
|
||||
In practice, the "build versus buy" calculus for hardware acceleration is dominated not by silicon efficiency but by **software ecosystem lock-in**. While a custom ASIC might offer a 10$\times$ improvement in performance per watt for specific tensor operations by discarding the generality tax of GPU display engines and rasterization logic, the switching costs are often prohibitive. Migrating the training stack for our `{python} frontier_params_b`B model from NVIDIA's CUDA to a proprietary ASIC environment requires rewriting highly optimized kernels for attention mechanisms, porting communication primitives like NCCL, and debugging numerical instabilities that vanish on established platforms. The **Non-Recurring Engineering (NRE)** cost for a modern 5nm ASIC design ranges from \$50 million to over \$200 million, with a design cycle of 24--36 months. By the time a custom chip optimized for a specific Transformer architecture returns from the fab, the state of the art may have shifted to Mixture-of-Experts or state-space models, rendering the hardware assumptions obsolete. Consequently, only hyperscalers with stable, massive-scale workloads can amortize this risk. For most organizations, the GPU premium is effectively an insurance policy against software incompatibility and architectural obsolescence.
|
||||
|
||||
This question sits at the foundation of every fleet. The smallest physical unit of the machine learning fleet is the accelerator[^fn-accelerator-generality], and every accelerator is a physical response to three fundamental constraints that limit computation:
|
||||
|
||||
@@ -345,7 +345,7 @@ Every architecture decision in this chapter is a response to one or more of thre
|
||||
```
|
||||
:::
|
||||
|
||||
This chapter begins at the silicon die and expands outward through four physical levels. At each level, we encounter the same engineering pattern: a constraint becomes intolerable, and the solution creates the next level of infrastructure. The Node aggregates accelerators to overcome memory capacity limits. The Rack concentrates nodes and confronts power delivery and cooling. The Pod wires racks into a warehouse-scale computer and faces the communication wall at full force. By the end, we will have mapped the complete physical stack from transistor to datacenter, with our `{python} gpt3_params_b`B model serving as the thread that connects each level.
|
||||
This chapter begins at the silicon die and expands outward through four physical levels. At each level, we encounter the same engineering pattern: a constraint becomes intolerable, and the solution creates the next level of infrastructure. The Node aggregates accelerators to overcome memory capacity limits. The Rack concentrates nodes and confronts power delivery and cooling. The Pod wires racks into a warehouse-scale computer and faces the communication wall at full force. By the end, we will have mapped the complete physical stack from transistor to datacenter, with our `{python} frontier_params_b`B model serving as the thread that connects each level.
|
||||
|
||||
[^fn-accelerator-generality]: **Accelerator (ML Context)**: A CPU devotes 5--10% of die area to arithmetic; a GPU devotes 50--60%; a TPU 70--80%. This progression quantifies the generality-efficiency trade-off: every percentage point of die area reclaimed from control logic and caches becomes additional multiply-accumulate units, directly increasing peak TFLOPS per watt but narrowing the set of operations the chip can execute efficiently. \index{Accelerator!generality trade-off}
|
||||
|
||||
@@ -490,7 +490,7 @@ Wafer-scale engines sit at a unique point on the spectrum: they are highly speci
|
||||
|
||||
: **The Accelerator Spectrum**. As we move from left to right, we trade general-purpose programmability for compute density and power efficiency. Wafer-scale engines occupy a distinct niche, providing cluster-scale performance on a single piece of silicon. {#tbl-accelerator-comparison}
|
||||
|
||||
For our `{python} gpt3_params_b`B model, the choice is not purely about peak FLOPS. If we are a research lab experimenting with novel architectures weekly, the GPU's flexibility justifies its generality tax. If we are deploying a fixed Transformer at scale for years, the TPU's dataflow efficiency or a custom ASIC's power advantage may dominate total cost. The accelerator spectrum is ultimately an economic question: *how much flexibility can we afford to surrender, given the stability of our workload?*
|
||||
For our `{python} frontier_params_b`B model, the choice is not purely about peak FLOPS. If we are a research lab experimenting with novel architectures weekly, the GPU's flexibility justifies its generality tax. If we are deploying a fixed Transformer at scale for years, the TPU's dataflow efficiency or a custom ASIC's power advantage may dominate total cost. The accelerator spectrum is ultimately an economic question: *how much flexibility can we afford to surrender, given the stability of our workload?*
|
||||
|
||||
An emerging trend in accelerator design is the **chiplet** architecture, exemplified by NVIDIA's Blackwell and AMD's Instinct MI300 series. Rather than fabricating a single monolithic die, chiplet-based designs partition the processor into multiple smaller dies connected by a high-bandwidth die-to-die interconnect on a common package substrate. This approach addresses two physical limitations that constrain monolithic designs.
|
||||
|
||||
@@ -522,16 +522,16 @@ The **Hopper** architecture (2022) added the Transformer Engine, which dynamical
|
||||
# │ Show: ~1,979 TFLOPS FP16.
|
||||
# │ How: pulling constants from mlsys.constants.
|
||||
# │
|
||||
# │ Imports: mlsys.constants (H100_FLOPS_FP16_TENSOR, TFLOPs, second)
|
||||
# │ Imports: mlsys.constants (system.peak_flops, TFLOPs, second)
|
||||
# │ Exports: h100_tflops
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
from mlsys.constants import H100_FLOPS_FP16_TENSOR, TFLOPs, second
|
||||
from mlsys.constants import system, TFLOPs, second
|
||||
|
||||
class H100PerformanceScenario:
|
||||
"""H100 peak performance reference."""
|
||||
|
||||
# ┌── 1. LOAD (Constants) ──────────────────────────────────────────────
|
||||
flops = H100_FLOPS_FP16_TENSOR
|
||||
flops = system.peak_flops
|
||||
|
||||
# ┌── 4. OUTPUT (Formatting) ──────────────────────────────────────────────
|
||||
h100_tflops = f"{flops.m_as(TFLOPs/second):.0f}"
|
||||
@@ -685,13 +685,13 @@ The accelerator's arithmetic engine is now extraordinarily powerful -- capable o
|
||||
|
||||
With the accelerator's arithmetic engine selected, we confront a paradox: the faster we make our logic, the more it idles waiting for data. This diverging trajectory between processor throughput and data access speed is formally known as the **Memory Wall**. While transistor scaling has driven logic performance up by orders of magnitude, the physical interconnects that feed data to these cores have failed to keep pace. This bottleneck is existential for machine learning: unlike traditional software that benefits heavily from caching and data reuse, neural networks must stream billions of weights from memory for every inference pass, often making bandwidth -- not compute -- the governing constraint on performance.
|
||||
|
||||
The implications are concrete and perceptible in our running example. Our `{python} gpt3_params_b`B model's weights occupy 350 GB in FP16. During autoregressive decoding, this entire 350 GB tensor must be streamed from off-chip memory into the processor's registers for *every single token* generated. Even at the highest available HBM bandwidths (~3.35 TB/s on an H100), this data movement alone dictates a latency floor of over 100 ms per token. The Memory Wall is not an abstract architectural concept -- it is the physical reason your chatbot takes a perceptible pause between words. To navigate this constraint, we turn to three critical engineering responses: **High Bandwidth Memory (HBM)** to widen the data pipe, the **Roofline Model** to rigorously diagnose whether a workload is starving for data or for compute, and **Tensor Cores** to maximize the arithmetic value of every byte fetched.
|
||||
The implications are concrete and perceptible in our running example. Our `{python} frontier_params_b`B model's weights occupy 350 GB in FP16. During autoregressive decoding, this entire 350 GB tensor must be streamed from off-chip memory into the processor's registers for *every single token* generated. Even at the highest available HBM bandwidths (~3.35 TB/s on an H100), this data movement alone dictates a latency floor of over 100 ms per token. The Memory Wall is not an abstract architectural concept -- it is the physical reason your chatbot takes a perceptible pause between words. To navigate this constraint, we turn to three critical engineering responses: **High Bandwidth Memory (HBM)** to widen the data pipe, the **Roofline Model** to rigorously diagnose whether a workload is starving for data or for compute, and **Tensor Cores** to maximize the arithmetic value of every byte fetched.
|
||||
|
||||
### HBM: Breaking the Memory Wall {#sec-compute-hbm}
|
||||
|
||||
\index{HBM}
|
||||
|
||||
With the accelerator's arithmetic engine selected, we confront a paradox. We can pack thousands of multipliers onto a single die, but if we cannot feed them with data fast enough, most of those multipliers sit idle on every clock cycle. Consider our `{python} gpt3_params_b`B model: its weights alone occupy 350 GB in FP16 precision. During the decode phase of text generation, the processor must stream the *entire* weight tensor through the arithmetic units for every single output token. The bottleneck is not the speed of multiplication; it is the speed of *delivery*. This is the Memory Wall in action: the processor is starved for data, not for arithmetic capability. No amount of additional Tensor Cores can help, because the existing ones are already idle for most of each cycle, waiting for the next tile of data to arrive from memory. The technology that defines the modern accelerator's response to this fundamental limitation is High Bandwidth Memory[^fn-hbm-origin].
|
||||
With the accelerator's arithmetic engine selected, we confront a paradox. We can pack thousands of multipliers onto a single die, but if we cannot feed them with data fast enough, most of those multipliers sit idle on every clock cycle. Consider our `{python} frontier_params_b`B model: its weights alone occupy 350 GB in FP16 precision. During the decode phase of text generation, the processor must stream the *entire* weight tensor through the arithmetic units for every single output token. The bottleneck is not the speed of multiplication; it is the speed of *delivery*. This is the Memory Wall in action: the processor is starved for data, not for arithmetic capability. No amount of additional Tensor Cores can help, because the existing ones are already idle for most of each cycle, waiting for the next tile of data to arrive from memory. The technology that defines the modern accelerator's response to this fundamental limitation is High Bandwidth Memory[^fn-hbm-origin].
|
||||
|
||||
The memory hierarchy within a single accelerator spans orders of magnitude in both capacity and bandwidth. At the top sits the **register file**[^fn-register-file-bandwidth] -- approximately 20--30 MB distributed across all SMs -- with effectively infinite bandwidth (hundreds of TB/s) but minuscule capacity. Below this lies the L1 cache and **shared memory** (SRAM)[^fn-sram-energy-efficiency], offering roughly 256 KB per SM (approximately 33 MB total) with an aggregate bandwidth of ~19 TB/s.
|
||||
|
||||
@@ -705,7 +705,7 @@ While **Archetype A (GPT-4)** is primarily throughput-bound (demanding more TFLO
|
||||
:::
|
||||
|
||||
The bandwidth gap between registers and HBM is approximately 1,000$\times$. If an operand must be fetched from HBM for a single operation, the arithmetic unit spends 99.9% of its time stalling.
|
||||
High Model FLOPS Utilization (MFU) is only possible through aggressive **tiling**: breaking the massive weight matrices into small tiles that fit entirely within shared memory and registers, then performing as many multiply-accumulate operations on each tile as possible before evicting it. Despite the `{python} gpt3_params_b`B model's massive total memory footprint, the active working set at any given microsecond must be meticulously managed to reside in that top 30 MB of register space, or the chip's theoretical performance becomes a mirage.
|
||||
High Model FLOPS Utilization (MFU) is only possible through aggressive **tiling**: breaking the massive weight matrices into small tiles that fit entirely within shared memory and registers, then performing as many multiply-accumulate operations on each tile as possible before evicting it. Despite the `{python} frontier_params_b`B model's massive total memory footprint, the active working set at any given microsecond must be meticulously managed to reside in that top 30 MB of register space, or the chip's theoretical performance becomes a mirage.
|
||||
|
||||
[^fn-hbm-origin]: **HBM (High Bandwidth Memory)**: Standardized by JEDEC in 2013 as a joint development between AMD and SK Hynix, originally for graphics cards. ML accelerators adopted HBM because neural networks exhibit the same bandwidth-hungry, capacity-moderate access pattern as high-end rendering. Each HBM generation has roughly doubled bandwidth (128 GB/s in HBM1 to 1.2 TB/s per stack in HBM3e), yet the gap between memory bandwidth and arithmetic throughput continues to widen -- making HBM a necessary but never sufficient response to the Memory Wall. \index{HBM!origin}
|
||||
|
||||
@@ -751,17 +751,17 @@ The interposer itself is a passive silicon substrate with etched wiring layers t
|
||||
# │ Show: ~200 GB/s DDR; ~3.35 TB/s HBM3.
|
||||
# │ How: pulling constants from mlsys.constants.
|
||||
# │
|
||||
# │ Imports: mlsys.constants (SYSTEM_MEMORY_BW, H100_MEM_BW, GB, TB, second)
|
||||
# │ Imports: mlsys.constants (SYSTEM_MEMORY_BW, system.memory_bw, GB, TB, second)
|
||||
# │ Exports: ddr_bw_str, h100_bw
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
from mlsys.constants import SYSTEM_MEMORY_BW, H100_MEM_BW, GB, TB, second
|
||||
from mlsys.constants import SYSTEM_MEMORY_BW, system, GB, TB, second
|
||||
|
||||
class MemoryBandwidthScenario:
|
||||
"""Memory bandwidth comparison reference."""
|
||||
|
||||
# ┌── 1. LOAD (Constants) ──────────────────────────────────────────────
|
||||
ddr_bw = SYSTEM_MEMORY_BW
|
||||
hbm_bw = H100_MEM_BW
|
||||
hbm_bw = system.memory_bw
|
||||
|
||||
# ┌── 4. OUTPUT (Formatting) ──────────────────────────────────────────────
|
||||
ddr_bw_str = f"{ddr_bw.m_as(GB/second):.0f}"
|
||||
@@ -790,7 +790,7 @@ The supply chain dynamics of HBM production further affect its cost and availabi
|
||||
|
||||
For infrastructure planners, this supply chain concentration means that HBM availability, not just its specifications, can determine the timeline for building a training cluster. Organizations planning large deployments must secure HBM allocations 12--18 months in advance, committing capital before the rest of the system is designed. This procurement lead time is longer than for any other component in the stack, making HBM the pacing element for fleet expansion.
|
||||
|
||||
For our `{python} gpt3_params_b`B model, the HBM alone in a cluster of 1,000 accelerators might represent \$50--80 million in memory cost. This cost-capacity trade-off explains why accelerators typically offer 80--192 GB of HBM while the host server provides 512 GB to 2 TB of DDR: the *fast* memory holds the active computation (weights, activations, gradients that are accessed every cycle), and the *cheap* memory holds everything else (optimizer states, checkpoint buffers, data loading queues).
|
||||
For our `{python} frontier_params_b`B model, the HBM alone in a cluster of 1,000 accelerators might represent \$50--80 million in memory cost. This cost-capacity trade-off explains why accelerators typically offer 80--192 GB of HBM while the host server provides 512 GB to 2 TB of DDR: the *fast* memory holds the active computation (weights, activations, gradients that are accessed every cycle), and the *cheap* memory holds everything else (optimizer states, checkpoint buffers, data loading queues).
|
||||
|
||||
The boundary between what resides in HBM and what resides in DDR is a critical design parameter for training frameworks, and managing this boundary efficiently is one of the key challenges addressed by ZeRO optimization and offloading strategies (@sec-distributed-training-systems). Getting this boundary wrong in either direction is costly: placing too much data in HBM wastes expensive capacity, while placing too much in DDR creates bandwidth stalls that idle the arithmetic units.
|
||||
|
||||
@@ -807,9 +807,9 @@ The evolution of HBM tracks the growth of model sizes with close correspondence.
|
||||
|
||||
: **Evolution of High Bandwidth Memory**. Each generation roughly doubles bandwidth, tracking the doubling of frontier model sizes every 12--18 months. The jump to HBM4 doubles the interface width for the first time since HBM's introduction, signaling that pin-rate increases alone can no longer sustain the required bandwidth growth. {#tbl-hbm-evolution}
|
||||
|
||||
The transition from HBM3 to HBM3e is particularly significant for our running example. An A100 with 80 GB of HBM2e can hold only 23% of our `{python} gpt3_params_b`B model's weights (at FP16). An H100 with 80 GB of HBM3 can hold the same fraction but deliver the data 65% faster. A B200 with 192 GB of HBM3e can hold 55% of the weights and deliver them at over 4.8 TB/s. Neither can hold the full model, which is precisely why we need multiple accelerators in a node, a topic we address in @sec-compute-node.
|
||||
The transition from HBM3 to HBM3e is particularly significant for our running example. An A100 with 80 GB of HBM2e can hold only 23% of our `{python} frontier_params_b`B model's weights (at FP16). An H100 with 80 GB of HBM3 can hold the same fraction but deliver the data 65% faster. A B200 with 192 GB of HBM3e can hold 55% of the weights and deliver them at over 4.8 TB/s. Neither can hold the full model, which is precisely why we need multiple accelerators in a node, a topic we address in @sec-compute-node.
|
||||
|
||||
However, the capacity story changes significantly when quantization is applied. The same `{python} gpt3_params_b`B model quantized to INT8 requires only 175 GB, fitting in 3 H100 GPUs or a single B200. Quantized to INT4, it requires only 87.5 GB, fitting in a single H100. The capacity constraints that drive the need for multi-accelerator nodes for training (where FP16 or BF16 precision is typically required) are substantially relaxed for inference (where INT8 or INT4 quantization is often acceptable). This is another reason why training and inference infrastructure have different optimal configurations.
|
||||
However, the capacity story changes significantly when quantization is applied. The same `{python} frontier_params_b`B model quantized to INT8 requires only 175 GB, fitting in 3 H100 GPUs or a single B200. Quantized to INT4, it requires only 87.5 GB, fitting in a single H100. The capacity constraints that drive the need for multi-accelerator nodes for training (where FP16 or BF16 precision is typically required) are substantially relaxed for inference (where INT8 or INT4 quantization is often acceptable). This is another reason why training and inference infrastructure have different optimal configurations.
|
||||
|
||||
The bandwidth improvement matters independently of capacity. Each generation of HBM produces a nearly proportional reduction in per-token latency during autoregressive inference. A 70B model on an A100 (2.0 TB/s HBM2e bandwidth) generates tokens at roughly 70 GB / 2.0 TB/s = 35 ms per token. The same model on an H100 (3.35 TB/s HBM3) generates tokens at 70 GB / 3.35 TB/s = 20.9 ms per token, a 1.67$\times$ improvement. On a B200 (4.8 TB/s HBM3e), latency drops further to 70 GB / 4.8 TB/s = 14.6 ms per token. For interactive applications (chatbots, code assistants, real-time translation), where users perceive delays above 50 ms as "slow," these bandwidth improvements translate directly into better user experience and into the ability to serve larger models within latency budgets.
|
||||
|
||||
@@ -974,7 +974,7 @@ Any workload can be plotted as a single point on this chart by computing its ari
|
||||
|
||||
If the point lies below either line, the workload is not fully using the available resource. This gap indicates an optimization opportunity in the software: kernel inefficiency, poor memory access patterns, or excessive synchronization. Closing this gap is the province of kernel engineering and communication optimization, topics examined in @sec-distributed-training-systems.
|
||||
|
||||
The Roofline's diagnostic power extends beyond individual kernels to entire training runs. For our `{python} gpt3_params_b`B model, the computation graph contains thousands of distinct operations with different arithmetic intensities. The dense Feed-Forward Network (FFN) layers are dominated by large GEMMs with high arithmetic intensity, placing them firmly in the compute-bound regime where Tensor Core utilization is the bottleneck. Conversely, operations like layer normalization and element-wise activations possess low arithmetic intensity, sitting deep in the memory-bound region where the compute units idle while waiting for data. The self-attention mechanism fluctuates between regimes depending on sequence length: while the quadratic complexity of attention scores suggests a compute bound, the loading of Key and Value matrices creates memory pressure at shorter sequences. This diagnostic distinction dictates the optimization strategy: memory-bound layers benefit from kernel fusion (reducing HBM round-trips), while compute-bound layers benefit from precision reduction (moving from FP16 to FP8, which effectively raises the hardware's compute ceiling).
|
||||
The Roofline's diagnostic power extends beyond individual kernels to entire training runs. For our `{python} frontier_params_b`B model, the computation graph contains thousands of distinct operations with different arithmetic intensities. The dense Feed-Forward Network (FFN) layers are dominated by large GEMMs with high arithmetic intensity, placing them firmly in the compute-bound regime where Tensor Core utilization is the bottleneck. Conversely, operations like layer normalization and element-wise activations possess low arithmetic intensity, sitting deep in the memory-bound region where the compute units idle while waiting for data. The self-attention mechanism fluctuates between regimes depending on sequence length: while the quadratic complexity of attention scores suggests a compute bound, the loading of Key and Value matrices creates memory pressure at shorter sequences. This diagnostic distinction dictates the optimization strategy: memory-bound layers benefit from kernel fusion (reducing HBM round-trips), while compute-bound layers benefit from precision reduction (moving from FP16 to FP8, which effectively raises the hardware's compute ceiling).
|
||||
|
||||
@fig-roofline-landscape makes these relationships visible for the H100. Notice how LLM decode at batch size 1 sits deep in the memory-bound region, achieving less than 1% of peak compute, while LLM training at large batch sizes crosses the ridge point into the compute-bound regime. The 591$\times$ gap between these two workloads' arithmetic intensities explains why the same hardware that delivers excellent training throughput can appear woefully underutilized during inference.
|
||||
|
||||
@@ -1065,7 +1065,7 @@ For training, the batch size is a hyperparameter that affects both statistical c
|
||||
|
||||
The practical value of the Roofline Model is that it tells us *which resource to optimize*. If a workload is memory-bound, buying a faster accelerator (more TFLOPS) yields no benefit; only higher memory bandwidth will help. Conversely, if a workload is compute-bound, upgrading HBM generations is wasted money. This diagnostic power is the reason that experienced infrastructure engineers always begin a hardware selection process by computing the arithmetic intensity of their target workload and plotting it against the candidate hardware's Roofline: the plot immediately reveals which hardware characteristic matters and which is irrelevant.
|
||||
|
||||
For our `{python} gpt3_params_b`B model, training with large batch sizes is compute-bound (optimize for TFLOPS), while serving individual requests is memory-bound (optimize for bandwidth). This duality explains why some organizations use different hardware generations for training and inference. An A100, with its lower cost and adequate memory bandwidth, may be more cost-effective for inference than an H100, despite the H100's higher peak FLOPS.
|
||||
For our `{python} frontier_params_b`B model, training with large batch sizes is compute-bound (optimize for TFLOPS), while serving individual requests is memory-bound (optimize for bandwidth). This duality explains why some organizations use different hardware generations for training and inference. An A100, with its lower cost and adequate memory bandwidth, may be more cost-effective for inference than an H100, despite the H100's higher peak FLOPS.
|
||||
|
||||
The Roofline Model also provides a quantitative framework for evaluating the return on investment of different optimizations. If a workload is 10$\times$ below the compute ceiling but already touching the bandwidth ceiling, spending engineering effort on kernel optimization (moving toward the compute ceiling) yields no benefit. The effort should instead be directed toward reducing memory traffic (shifting the workload rightward on the plot) through techniques like batching, kernel fusion, or quantization.
|
||||
|
||||
@@ -1085,19 +1085,19 @@ This diagnostic power makes the Roofline Model one of the most practically usefu
|
||||
# │ Show: ~1,979 TFLOPS; ~3.35 TB/s; ~591 FLOP/byte ridge.
|
||||
# │ How: pulling constants from mlsys.constants.
|
||||
# │
|
||||
# │ Imports: mlsys.constants (H100_FLOPS_FP16_TENSOR, H100_MEM_BW, GPT3_PARAMS,
|
||||
# │ Imports: mlsys.constants (system, Models,
|
||||
# │ TFLOPs, TB, second, param, BILLION, flop, byte)
|
||||
# │ Exports: frontier_params_b, h100_tflops, h100_bw, h100_ridge
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
from mlsys.constants import H100_FLOPS_FP16_TENSOR, H100_MEM_BW, GPT3_PARAMS, TFLOPs, TB, second, param, BILLION, flop, byte
|
||||
from mlsys.constants import system, Models, TFLOPs, TB, second, param, BILLION, flop, byte
|
||||
|
||||
class RooflineScenario:
|
||||
"""H100 and GPT-3 specs for roofline analysis."""
|
||||
|
||||
# ┌── 1. LOAD (Constants) ──────────────────────────────────────────────
|
||||
flops = H100_FLOPS_FP16_TENSOR
|
||||
bw = H100_MEM_BW
|
||||
gpt3_params = GPT3_PARAMS.m_as(param)
|
||||
flops = system.peak_flops
|
||||
bw = system.memory_bw
|
||||
gpt3_params = Models.GPT3.parameters.m_as(param)
|
||||
|
||||
# ┌── 2. EXECUTE (The Compute) ────────────────────────────────────────
|
||||
ridge = (flops / bw).m_as(flop/byte)
|
||||
@@ -1116,7 +1116,7 @@ h100_bw = RooflineScenario.h100_bw
|
||||
h100_ridge = RooflineScenario.h100_ridge
|
||||
```
|
||||
|
||||
Consider our `{python} gpt3_params_b`B model on an H100 with `{python} h100_tflops` TFLOPS peak compute and `{python} h100_bw` TB/s memory bandwidth. The ridge point is `{python} h100_ridge` FLOP/byte.
|
||||
Consider our `{python} frontier_params_b`B model on an H100 with `{python} h100_tflops` TFLOPS peak compute and `{python} h100_bw` TB/s memory bandwidth. The ridge point is `{python} h100_ridge` FLOP/byte.
|
||||
|
||||
**Inference (Decode, Batch Size 1)**: Each token loads 350 GB of FP16 weights and performs `{python} inf_flops_math` FLOPs. Arithmetic intensity: $I = 350 \times 10^9 / 350 \times 10^9 = 1.0$ FLOP/byte. Since $1.0 \ll$ `{python} h100_ridge` (the ridge point), the workload is deeply memory-bound. The achievable throughput is `{python} inf_throughput_math` TFLOPS, which is less than 0.2% of the H100's peak `{python} h100_tflops` TFLOPS. No amount of additional compute will help; only more memory bandwidth improves throughput.
|
||||
|
||||
@@ -1146,7 +1146,7 @@ With 528 Tensor Cores across the chip (4 per SM, 132 SMs), the H100 can execute
|
||||
|
||||
Google's MXUs take the same concept further by organizing the multipliers into a systolic array. In a systolic MXU, one matrix is loaded into the array's weight registers while the other matrix streams through the array one row at a time. Each cell multiplies the incoming activation by its stored weight, adds the result to the partial sum flowing from the cell above, and passes both the activation rightward and the partial sum downward. This pipelined flow means the array is performing useful computation on every cycle, with no idle cells once the pipeline is full. The TPU v5p contains two $128\times128$ MXUs per chip, providing 459 TFLOPS of BF16 throughput. The systolic dataflow eliminates the register file accesses between operations that a Tensor Core still requires, achieving marginally higher energy efficiency at the cost of the flexibility to run non-matrix workloads.
|
||||
|
||||
For our `{python} gpt3_params_b`B model, the choice between Tensor Cores and MXUs manifests in the compiler stack. CUDA kernels can intermix Tensor Core operations with arbitrary thread-level code, enabling fused kernels that combine matrix multiplies with activation functions, dropout, and layer normalization in a single launch. This fusion is critical for performance because it eliminates the intermediate memory reads and writes that would otherwise occur between operations, keeping data in registers and shared memory where access is fastest.
|
||||
For our `{python} frontier_params_b`B model, the choice between Tensor Cores and MXUs manifests in the compiler stack. CUDA kernels can intermix Tensor Core operations with arbitrary thread-level code, enabling fused kernels that combine matrix multiplies with activation functions, dropout, and layer normalization in a single launch. This fusion is critical for performance because it eliminates the intermediate memory reads and writes that would otherwise occur between operations, keeping data in registers and shared memory where access is fastest.
|
||||
|
||||
XLA compilation for TPUs must decompose the computation into sequences of matrix operations that map onto the systolic dataflow, which can be more efficient for standard Transformer architectures but less flexible for custom operations. The XLA compiler performs whole-program optimization, analyzing the entire computation graph to find the optimal tiling, memory layout, and execution schedule for the systolic array. For standard Transformer layers, this whole-program optimization can achieve higher hardware utilization than hand-tuned CUDA kernels, because the compiler can reason about the entire computation rather than optimizing individual operations in isolation.
|
||||
|
||||
@@ -1176,7 +1176,7 @@ The Transformer Engine selects between these formats on a per-layer basis, and t
|
||||
|
||||
The practical implication for fleet design is that peak TFLOPS specifications are precision-dependent. The H100 delivers `{python} h100_tflops` TFLOPS in FP16 but twice that (3,958 TFLOPS) in FP8. A fleet designed for FP8 training effectively has twice the compute density of the same fleet running FP16, with no additional hardware. This makes precision engineering a first-class optimization lever for infrastructure planners, not just a model accuracy concern.
|
||||
|
||||
To achieve this peak throughput in practice, the entire accelerator must be viewed as a rigid pipeline where data flows from HBM through a deepening hierarchy of caches before reaching the Tensor Cores. The fundamental constraint is **pipeline balance**: the rate at which data is staged into registers must match or exceed the rate at which the arithmetic units consume it. When this balance breaks -- when the arithmetic intensity of an operation falls below the ridge point -- the pipeline stalls, leaving teraflops of compute potential idle while waiting for data. This makes **kernel fusion** the single most critical software optimization for large-scale training. By fusing multiple operations (matrix multiplication, bias addition, activation function) into a single kernel, the system eliminates the round-trips to HBM that would occur if each operation were executed sequentially. Consider the attention mechanism: a naive, unfused implementation writes the $N \times N$ attention matrix to HBM only to read it back for the softmax operation, a round trip capped by the 3.35 TB/s memory bandwidth. A fused implementation like FlashAttention keeps these intermediate matrices entirely in on-chip SRAM, bypassing HBM and allowing the Tensor Cores to run near their theoretical peak. For our `{python} gpt3_params_b`B model, where each training step involves thousands of matrix operations across 96 Transformer layers, the difference between fused and unfused kernels can be a 2--3$\times$ throughput improvement -- the difference between a 2-week and a 6-week training run.
|
||||
To achieve this peak throughput in practice, the entire accelerator must be viewed as a rigid pipeline where data flows from HBM through a deepening hierarchy of caches before reaching the Tensor Cores. The fundamental constraint is **pipeline balance**: the rate at which data is staged into registers must match or exceed the rate at which the arithmetic units consume it. When this balance breaks -- when the arithmetic intensity of an operation falls below the ridge point -- the pipeline stalls, leaving teraflops of compute potential idle while waiting for data. This makes **kernel fusion** the single most critical software optimization for large-scale training. By fusing multiple operations (matrix multiplication, bias addition, activation function) into a single kernel, the system eliminates the round-trips to HBM that would occur if each operation were executed sequentially. Consider the attention mechanism: a naive, unfused implementation writes the $N \times N$ attention matrix to HBM only to read it back for the softmax operation, a round trip capped by the 3.35 TB/s memory bandwidth. A fused implementation like FlashAttention keeps these intermediate matrices entirely in on-chip SRAM, bypassing HBM and allowing the Tensor Cores to run near their theoretical peak. For our `{python} frontier_params_b`B model, where each training step involves thousands of matrix operations across 96 Transformer layers, the difference between fused and unfused kernels can be a 2--3$\times$ throughput improvement -- the difference between a 2-week and a 6-week training run.
|
||||
|
||||
#### Peak vs. Sustained Throughput
|
||||
|
||||
@@ -1378,14 +1378,14 @@ At fleet scale, DVFS also affects power planning. The datacenter's power deliver
|
||||
# │ How: pulling constants from mlsys.constants.
|
||||
# │
|
||||
# │ Imports: mlsys.constants (V100_FLOPS_FP16_TENSOR, V100_TDP,
|
||||
# │ A100_FLOPS_FP16_TENSOR, A100_TDP, H100_FLOPS_FP16_TENSOR,
|
||||
# │ A100_FLOPS_FP16_TENSOR, A100_TDP, system,
|
||||
# │ H100_TDP, B200_FLOPS_FP16_TENSOR, B200_TDP, TFLOPs, second, watt)
|
||||
# │ Exports: v100_ef, a100_ef, h100_ef, b200_ef
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
from mlsys.constants import (
|
||||
V100_FLOPS_FP16_TENSOR, V100_TDP,
|
||||
A100_FLOPS_FP16_TENSOR, A100_TDP,
|
||||
H100_FLOPS_FP16_TENSOR, H100_TDP,
|
||||
system, H100_TDP,
|
||||
B200_FLOPS_FP16_TENSOR, B200_TDP,
|
||||
TFLOPs, second, watt
|
||||
)
|
||||
@@ -1398,7 +1398,7 @@ class GpuEfficiencyScenario:
|
||||
v100_tdp = V100_TDP
|
||||
a100_f16 = A100_FLOPS_FP16_TENSOR
|
||||
a100_tdp = A100_TDP
|
||||
h100_f16 = H100_FLOPS_FP16_TENSOR
|
||||
h100_f16 = system.peak_flops
|
||||
h100_tdp = H100_TDP
|
||||
b200_f16 = B200_FLOPS_FP16_TENSOR
|
||||
b200_tdp = B200_TDP
|
||||
@@ -1432,7 +1432,7 @@ The implications of TDP for fleet design are profound and will recur throughout
|
||||
|
||||
The preceding sections have established the physical characteristics of accelerators: compute throughput, memory bandwidth, memory capacity, power consumption, and the trade-offs between generality and efficiency. How do these characteristics map to the workloads that a fleet must support? The answer depends on the workload's position on the Roofline plot and its specific resource requirements.
|
||||
|
||||
Large language model training -- our `{python} gpt3_params_b`B running example -- is typically compute-bound during the forward and backward passes because large batch sizes push the arithmetic intensity above the ridge point. The dominant hardware requirement is peak TFLOPS, with memory capacity being a constraint that determines the minimum number of accelerators needed to hold the model state. The H100 and B200 are well-suited for this workload, as are TPU v5p pods for organizations willing to accept the XLA compilation requirement. The choice between GPU and TPU often comes down to software ecosystem compatibility: PyTorch-native research teams prefer GPUs, while JAX-based teams can take advantage of TPU's cost efficiency.
|
||||
Large language model training -- our `{python} frontier_params_b`B running example -- is typically compute-bound during the forward and backward passes because large batch sizes push the arithmetic intensity above the ridge point. The dominant hardware requirement is peak TFLOPS, with memory capacity being a constraint that determines the minimum number of accelerators needed to hold the model state. The H100 and B200 are well-suited for this workload, as are TPU v5p pods for organizations willing to accept the XLA compilation requirement. The choice between GPU and TPU often comes down to software ecosystem compatibility: PyTorch-native research teams prefer GPUs, while JAX-based teams can take advantage of TPU's cost efficiency.
|
||||
|
||||
Large language model inference is overwhelmingly memory-bound during the autoregressive decode phase, as the token latency analysis demonstrated. The key hardware metric is memory bandwidth per dollar, not peak TFLOPS. For batch-1 serving (single user), the H100's `{python} h100_bw` TB/s bandwidth determines throughput, and the compute units sit almost entirely idle. For batched serving (multiple concurrent users), increasing the batch size raises the arithmetic intensity, gradually shifting the workload toward the compute-bound regime. This shift explains why serving systems aggressively batch requests: it transforms a memory-bound workload into one that can actually use the expensive compute silicon.
|
||||
|
||||
@@ -1491,7 +1491,7 @@ The infrastructure implications are substantial. A fleet designed for BF16 train
|
||||
|
||||
However, not all workloads can use FP8 without accuracy degradation. The reduced mantissa precision (3 bits in E4M3) means that values must be scaled carefully to avoid overflow (values exceeding the representable range are clamped to infinity) or underflow (small values rounded to zero). The Transformer Engine's dynamic per-tensor scaling addresses this challenge for standard Transformer architectures, but custom model architectures with unusual activation distributions may require manual precision tuning. The infrastructure team must therefore work closely with the model team to determine the lowest precision that maintains acceptable accuracy, as this decision directly affects the effective throughput and therefore the required cluster size.
|
||||
|
||||
For inference, **quantization** stands as the single most impactful optimization for serving economics, directly altering the hardware topology required for large models. Our `{python} gpt3_params_b`B model at FP16 requires 350 GB, necessitating a minimum of five 80 GB H100 GPUs simply to load the parameters. At INT8, this drops to 175 GB, fitting on three GPUs. At INT4, it shrinks to 87.5 GB, fitting on two GPUs. Because inference is strictly memory-bandwidth-bound, reading 4-bit weights instead of 16-bit weights effectively quadruples the available bandwidth for weight loading, proportionally reducing per-token latency. A production team deploying at INT4 rather than FP16 reduces their inference fleet by 4$\times$, saving millions of dollars annually at scale while incurring less than 1% quality loss on standard benchmarks when using group quantization techniques.
|
||||
For inference, **quantization** stands as the single most impactful optimization for serving economics, directly altering the hardware topology required for large models. Our `{python} frontier_params_b`B model at FP16 requires 350 GB, necessitating a minimum of five 80 GB H100 GPUs simply to load the parameters. At INT8, this drops to 175 GB, fitting on three GPUs. At INT4, it shrinks to 87.5 GB, fitting on two GPUs. Because inference is strictly memory-bandwidth-bound, reading 4-bit weights instead of 16-bit weights effectively quadruples the available bandwidth for weight loading, proportionally reducing per-token latency. A production team deploying at INT4 rather than FP16 reduces their inference fleet by 4$\times$, saving millions of dollars annually at scale while incurring less than 1% quality loss on standard benchmarks when using group quantization techniques.
|
||||
|
||||
For training, modern workloads use **mixed-precision training**, a strategy that decouples storage precision from arithmetic precision. A "master copy" of weights remains in FP32 for numerical stability, while the computationally intensive forward and backward passes are cast to BF16 or FP16 to exploit the full throughput of Tensor Cores. Gradients are accumulated in FP32 to prevent underflow before the optimizer updates the master weights. The H100's Transformer Engine extends this paradigm by dynamically selecting between FP8 and FP16 on a per-layer basis, potentially doubling throughput again for layers resilient to reduced precision. The cumulative effect on training velocity is profound: a training run that demands four weeks in pure FP32 can complete in approximately one week using a mixed FP16/FP8 strategy, with negligible quality degradation.
|
||||
|
||||
@@ -1513,7 +1513,7 @@ The iron law of modern computing is that moving data costs significantly more en
|
||||
|
||||
Performing one FP16 multiply-accumulate (**MAC**) operation -- the atomic unit of deep learning -- consumes approximately **1 picojoule (pJ)**. This is the baseline cost of useful work. Reading a single FP16 operand (16 bits) from HBM consumes roughly 4 pJ per bit, totaling **64 pJ**. Reading that same operand from off-package DRAM costs approximately 20 pJ per bit, or **320 pJ**. The asymmetry is staggering: reading a value from HBM costs 64$\times$ more energy than computing on it. Retrieving it from standard DRAM costs 320$\times$ more.
|
||||
|
||||
The macro impact appears when serving our `{python} gpt3_params_b`B model. Generating a single token requires loading the entire 350 GB weight tensor from HBM:
|
||||
The macro impact appears when serving our `{python} frontier_params_b`B model. Generating a single token requires loading the entire 350 GB weight tensor from HBM:
|
||||
|
||||
$$\text{Data Movement Energy} = 350 \text{ GB} \times 8 \text{ bits/byte} \times 4 \text{ pJ/bit} \approx \mathbf{11.2 \text{ Joules}}$$
|
||||
|
||||
@@ -1535,7 +1535,7 @@ MLPerf Inference measures the throughput and latency of serving trained models u
|
||||
|
||||
A critical subtlety in interpreting MLPerf results is the distinction between **closed** and **open** divisions. The closed division requires all submissions to use the same model architecture, hyperparameters, and training recipe, isolating the hardware and system software as the only variables. This makes closed-division results directly comparable across vendors. The open division allows arbitrary model modifications, software optimizations, and custom kernels, which can demonstrate the ceiling of a platform's capability but makes cross-vendor comparison unreliable. Infrastructure teams evaluating procurement decisions should weight closed-division results more heavily, as they reflect the performance an organization will achieve with standard frameworks and configurations, not the performance achievable only by the vendor's own optimization team.
|
||||
|
||||
For our `{python} gpt3_params_b`B model, no single benchmark captures the full picture. The training phase is compute-bound at large batch sizes, making peak TFLOPS and scaling efficiency the dominant metrics. The inference phase at batch size 1 is memory-bandwidth-bound, making GB/s per dollar the relevant metric. The fine-tuning phase falls between these extremes, with moderate batch sizes placing the workload near the Roofline ridge point. A comprehensive evaluation must benchmark all three phases on the candidate hardware, using the organization's actual model and data pipeline rather than relying solely on published MLPerf numbers.
|
||||
For our `{python} frontier_params_b`B model, no single benchmark captures the full picture. The training phase is compute-bound at large batch sizes, making peak TFLOPS and scaling efficiency the dominant metrics. The inference phase at batch size 1 is memory-bandwidth-bound, making GB/s per dollar the relevant metric. The fine-tuning phase falls between these extremes, with moderate batch sizes placing the workload near the Roofline ridge point. A comprehensive evaluation must benchmark all three phases on the candidate hardware, using the organization's actual model and data pipeline rather than relying solely on published MLPerf numbers.
|
||||
|
||||
::: {.callout-perspective title="Beyond Peak Specifications"}
|
||||
|
||||
@@ -1569,7 +1569,7 @@ These different requirements drive different hardware and architecture choices:
|
||||
|
||||
These differences mean that the optimal serving infrastructure often uses different hardware, different software, and different facility designs than the training infrastructure. Some organizations use previous-generation GPUs (A100s) for serving because the memory bandwidth per dollar is competitive with current-generation GPUs, and the lower TDP (400 W vs. 700 W) allows higher rack density and lower cooling costs.
|
||||
|
||||
The inference workload itself bifurcates into two distinct phases with opposing bottlenecks. The **prefill** phase processes the input prompt, performing a large matrix-matrix multiplication that is compute-bound and achieves high Tensor Core utilization. The **decode** phase generates tokens one by one, performing matrix-vector multiplications that are deeply memory-bandwidth-bound. To maximize hardware utilization, modern serving systems employ **continuous batching**, which schedules requests at the iteration level rather than the request level. This allows the engine to inject a new prefill computation for a waiting request into the idle compute slots of an ongoing decode batch, dynamically filling the GPU's arithmetic pipelines. For our `{python} gpt3_params_b`B model, serving 10,000 requests per second with a 50 ms time-to-first-token SLA requires distributing traffic across hundreds of 8-GPU replicas (each holding the full model via tensor parallelism), with a load balancer routing requests to the replica with the lowest current queue depth. Unlike training, where 99.9% reliability is acceptable via checkpointing, inference architectures must account for tail latency, where a single slow replica can violate the SLA for the entire request batch.
|
||||
The inference workload itself bifurcates into two distinct phases with opposing bottlenecks. The **prefill** phase processes the input prompt, performing a large matrix-matrix multiplication that is compute-bound and achieves high Tensor Core utilization. The **decode** phase generates tokens one by one, performing matrix-vector multiplications that are deeply memory-bandwidth-bound. To maximize hardware utilization, modern serving systems employ **continuous batching**, which schedules requests at the iteration level rather than the request level. This allows the engine to inject a new prefill computation for a waiting request into the idle compute slots of an ongoing decode batch, dynamically filling the GPU's arithmetic pipelines. For our `{python} frontier_params_b`B model, serving 10,000 requests per second with a 50 ms time-to-first-token SLA requires distributing traffic across hundreds of 8-GPU replicas (each holding the full model via tensor parallelism), with a load balancer routing requests to the replica with the lowest current queue depth. Unlike training, where 99.9% reliability is acceptable via checkpointing, inference architectures must account for tail latency, where a single slow replica can violate the SLA for the entire request batch.
|
||||
|
||||
#### The Accelerator Decision Matrix
|
||||
|
||||
@@ -1587,7 +1587,7 @@ The preceding sections have established the physical characteristics, analytical
|
||||
|
||||
: **The Accelerator Decision Matrix**. The optimal accelerator depends not on peak specifications but on which physical resource -- compute, bandwidth, or capacity -- is the binding constraint for the target workload. The Roofline Model provides the diagnostic: compute the arithmetic intensity, locate the workload relative to the ridge point, and select hardware that maximizes the binding resource per dollar. {#tbl-accelerator-decision}
|
||||
|
||||
For our `{python} gpt3_params_b`B model, the decision depends on the phase of the model's lifecycle. During training, large batch sizes push the workload into the compute-bound regime, making peak TFLOPS the dominant metric. The H100 or B200, with their high Tensor Core throughput and fast NVLink for tensor parallelism, are the natural choices. During inference at low batch sizes, the same model becomes deeply memory-bound, and the relevant metric shifts to bandwidth per dollar. An A100 with adequate HBM bandwidth at a lower price point may deliver better cost-per-token than an H100 whose additional TFLOPS go unused.
|
||||
For our `{python} frontier_params_b`B model, the decision depends on the phase of the model's lifecycle. During training, large batch sizes push the workload into the compute-bound regime, making peak TFLOPS the dominant metric. The H100 or B200, with their high Tensor Core throughput and fast NVLink for tensor parallelism, are the natural choices. During inference at low batch sizes, the same model becomes deeply memory-bound, and the relevant metric shifts to bandwidth per dollar. An A100 with adequate HBM bandwidth at a lower price point may deliver better cost-per-token than an H100 whose additional TFLOPS go unused.
|
||||
|
||||
The organizational dimension adds a further consideration. Research labs that modify model architectures weekly need the flexibility of the GPU's CUDA ecosystem, where custom kernels can be written and tested in hours. Production teams running a fixed Transformer architecture at scale for months may benefit from TPUs, where the XLA compiler's whole-program optimization can achieve higher sustained utilization than hand-tuned CUDA kernels for standard operations. Custom ASICs make economic sense only for organizations with enough scale to amortize the \$50--200 million NRE cost and enough workload stability to justify a 2--3 year design cycle. The accelerator spectrum is ultimately an economic question answered at the intersection of workload physics, organizational scale, and time horizon.
|
||||
|
||||
@@ -1597,7 +1597,7 @@ Your team needs to deploy a 70B-parameter model for both training and inference.
|
||||
|
||||
:::
|
||||
|
||||
With the accelerator's physics established, we face a concrete problem. Our `{python} gpt3_params_b`B model requires 350 GB of memory for its weights alone in FP16, and training with Adam optimizer states roughly triples that requirement to over 1 TB. A single H100 provides 80 GB of HBM. No single accelerator can hold this model. We must expand to the next physical level: the node.
|
||||
With the accelerator's physics established, we face a concrete problem. Our `{python} frontier_params_b`B model requires 350 GB of memory for its weights alone in FP16, and training with Adam optimizer states roughly triples that requirement to over 1 TB. A single H100 provides 80 GB of HBM. No single accelerator can hold this model. We must expand to the next physical level: the node.
|
||||
|
||||
## The Node {#sec-compute-node}
|
||||
|
||||
@@ -1605,7 +1605,7 @@ With the accelerator's physics established, we face a concrete problem. Our `{py
|
||||
\index{NVLink}
|
||||
\index{NVSwitch}
|
||||
|
||||
Our `{python} gpt3_params_b`B model needs 350 GB for its FP16 weights alone. With Adam optimizer states (which store first and second moments of every parameter), the total memory requirement for training exceeds 1 TB. A single H100 provides 80 GB of HBM. We need at least 5 accelerators just for the weights, and with optimizer state, all 8 in a dense node are barely sufficient. This arithmetic forces us beyond the single die to the next physical level: the **node**.
|
||||
Our `{python} frontier_params_b`B model needs 350 GB for its FP16 weights alone. With Adam optimizer states (which store first and second moments of every parameter), the total memory requirement for training exceeds 1 TB. A single H100 provides 80 GB of HBM. We need at least 5 accelerators just for the weights, and with optimizer state, all 8 in a dense node are barely sufficient. This arithmetic forces us beyond the single die to the next physical level: the **node**.
|
||||
|
||||
::: {.callout-definition title="Node"}
|
||||
|
||||
@@ -1623,7 +1623,7 @@ The node bridges this gap by aggregating the HBM of multiple accelerators into a
|
||||
|
||||
The economic argument for multi-accelerator nodes is equally compelling. Consider the alternative: building a single accelerator with enough HBM to hold a 175B model (350 GB in FP16). This would require approximately 4--5 HBM stacks at current capacities, with a total interposer area exceeding 2,000 mm$^2$. The manufacturing cost of such a package would be prohibitive (yield decreases exponentially with area), and the power delivery to a single chip with enough Tensor Cores to use all that bandwidth would exceed any practical cooling solution. By distributing the computation across 8 accelerators connected by NVLink, the node achieves the aggregate memory capacity and compute throughput of this hypothetical super-chip while remaining within manufacturable and coolable boundaries.
|
||||
|
||||
Understanding how the node partitions memory across its components is essential for selecting parallelism strategies. Consider the memory budget for training our `{python} gpt3_params_b`B model. The model weights in FP16 require 350 GB. The Adam optimizer maintains two additional copies of every parameter (first and second moments), adding another 700 GB in FP32 precision. Gradients require another 350 GB. The total is approximately 1.4 TB, far exceeding a single accelerator's 80 GB HBM but within reach of a node's aggregate capacity.
|
||||
Understanding how the node partitions memory across its components is essential for selecting parallelism strategies. Consider the memory budget for training our `{python} frontier_params_b`B model. The model weights in FP16 require 350 GB. The Adam optimizer maintains two additional copies of every parameter (first and second moments), adding another 700 GB in FP32 precision. Gradients require another 350 GB. The total is approximately 1.4 TB, far exceeding a single accelerator's 80 GB HBM but within reach of a node's aggregate capacity.
|
||||
|
||||
When using ZeRO optimization (which shards optimizer states, gradients, and optionally parameters across data-parallel workers), each GPU in an 8-GPU node holds only 1/8 of the optimizer state, reducing the per-GPU memory requirement to roughly 175 GB of equivalent storage. This still exceeds 80 GB per GPU, necessitating techniques like activation checkpointing (recomputing intermediate activations during the backward pass rather than storing them) and offloading optimizer states to host DRAM over PCIe.
|
||||
|
||||
@@ -1711,7 +1711,7 @@ This is why TP is confined to within a single node, while data parallelism (whic
|
||||
|
||||
The hierarchy also explains why pipeline parallelism occupies an intermediate position in the bandwidth requirements. In pipeline parallelism, the model is divided into sequential stages, with each stage assigned to a different group of accelerators. The communication between stages consists of activations flowing forward during the forward pass and gradients flowing backward during the backward pass, with a volume proportional to the batch size times the hidden dimension.
|
||||
|
||||
For our `{python} gpt3_params_b`B model with hidden dimension 12,288 and a microbatch of 4 sequences at 2,048 tokens, the activation tensor at each stage boundary is approximately $4 \times 2{,}048 \times 12{,}288 \times 2$ bytes (FP16) $\approx$ 200 MB. This is far smaller than the 350 GB gradient AllReduce required by data parallelism, which is why pipeline parallelism places less demanding requirements on the inter-node network.
|
||||
For our `{python} frontier_params_b`B model with hidden dimension 12,288 and a microbatch of 4 sequences at 2,048 tokens, the activation tensor at each stage boundary is approximately $4 \times 2{,}048 \times 12{,}288 \times 2$ bytes (FP16) $\approx$ 200 MB. This is far smaller than the 350 GB gradient AllReduce required by data parallelism, which is why pipeline parallelism places less demanding requirements on the inter-node network.
|
||||
|
||||
This communication occurs once per microbatch per stage boundary, which is far less frequent than tensor parallelism's per-layer AllReduce. Pipeline parallelism can therefore tolerate the lower bandwidth of inter-node links, making it the preferred strategy for spanning multiple nodes when the model's depth exceeds a single node's capacity.
|
||||
|
||||
@@ -1758,20 +1758,20 @@ Consider the NVIDIA DGX H100 architecture. Eight H100 GPUs sit on a single baseb
|
||||
# │
|
||||
# │ Goal: Provide aggregate node-level compute and memory capacity.
|
||||
# │ Show: ~640 GB aggregate HBM; ~15.8 PFLOPS aggregate FP16.
|
||||
# │ How: H100_MEM_CAPACITY * GPUS_PER_HOST; H100_FLOPS_FP16_TENSOR * GPUS_PER_HOST.
|
||||
# │ How: system.ram * GPUS_PER_HOST; system.peak_flops * GPUS_PER_HOST.
|
||||
# │
|
||||
# │ Imports: mlsys.constants (H100_MEM_CAPACITY, H100_FLOPS_FP16_TENSOR,
|
||||
# │ Imports: mlsys.constants (system,
|
||||
# │ GPUS_PER_HOST, GB, TFLOPs, second)
|
||||
# │ Exports: node_hbm_cap, h100_tflops
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
from mlsys.constants import H100_MEM_CAPACITY, H100_FLOPS_FP16_TENSOR, GPUS_PER_HOST, GB, TFLOPs, second
|
||||
from mlsys.constants import system, GPUS_PER_HOST, GB, TFLOPs, second
|
||||
|
||||
class NodeCapacityScenario:
|
||||
"""Namespace for aggregate node capacity."""
|
||||
|
||||
# ┌── 1. LOAD (Constants) ──────────────────────────────────────────────
|
||||
mem = H100_MEM_CAPACITY
|
||||
flops = H100_FLOPS_FP16_TENSOR
|
||||
mem = system.ram
|
||||
flops = system.peak_flops
|
||||
n_gpus = GPUS_PER_HOST
|
||||
|
||||
# ┌── 2. EXECUTE (The Compute) ────────────────────────────────────────
|
||||
@@ -1821,7 +1821,7 @@ The host CPU's role is analogous to an operating system kernel in a traditional
|
||||
|
||||
The host CPU also runs the training framework's Python runtime, which orchestrates kernel launches, manages the computation graph, and coordinates collective communication operations. In well-optimized training loops, the host CPU is pipelining the next batch's data loading and preprocessing while the GPUs are executing the current batch's forward and backward passes, keeping all components busy simultaneously.
|
||||
|
||||
As the "datacenter tax" on host CPUs has grown -- with up to 30% of CPU cycles consumed by network protocol processing, storage virtualization, and security functions -- modern ML nodes increasingly incorporate **Data Processing Units (DPUs)** or SmartNICs. Devices like the NVIDIA BlueField DPU offload these infrastructure tasks to dedicated ARM cores and hardware accelerators integrated into the network adapter itself. By moving the control plane for RDMA, firewall rules, and storage protocols to the DPU, the host CPU recovers nearly all its cycles for the ML pipeline: data loading, tokenization, and kernel orchestration. The DPU also acts as an isolated security domain, allowing cloud providers to maintain control over the network and storage layer via the DPU while granting customers full access to the host CPU and GPUs. For our `{python} gpt3_params_b`B model training, the DPU's offloading of RDMA protocol processing ensures that the host CPU is never on the critical path for gradient synchronization, which flows directly from GPU HBM through the DPU to the InfiniBand fabric without host CPU involvement.
|
||||
As the "datacenter tax" on host CPUs has grown -- with up to 30% of CPU cycles consumed by network protocol processing, storage virtualization, and security functions -- modern ML nodes increasingly incorporate **Data Processing Units (DPUs)** or SmartNICs. Devices like the NVIDIA BlueField DPU offload these infrastructure tasks to dedicated ARM cores and hardware accelerators integrated into the network adapter itself. By moving the control plane for RDMA, firewall rules, and storage protocols to the DPU, the host CPU recovers nearly all its cycles for the ML pipeline: data loading, tokenization, and kernel orchestration. The DPU also acts as an isolated security domain, allowing cloud providers to maintain control over the network and storage layer via the DPU while granting customers full access to the host CPU and GPUs. For our `{python} frontier_params_b`B model training, the DPU's offloading of RDMA protocol processing ensures that the host CPU is never on the critical path for gradient synchronization, which flows directly from GPU HBM through the DPU to the InfiniBand fabric without host CPU involvement.
|
||||
|
||||
#### Alternative Node Architectures
|
||||
|
||||
@@ -1829,7 +1829,7 @@ The DGX architecture is not the only approach to building dense multi-accelerato
|
||||
|
||||
AMD's MI300X represents a particularly interesting alternative. The MI300X is a chiplet-based design that integrates 8 GPU compute dies (XCDs) and 4 I/O dies on a single package, with 192 GB of HBM3 memory providing 5.3 TB/s of aggregate bandwidth. The intra-package communication between XCDs uses AMD's Infinity Fabric, achieving bandwidth comparable to NVLink within the package. For multi-accelerator nodes, AMD uses Infinity Fabric links between packages, similar in concept to NVLink but with different bandwidth characteristics.
|
||||
|
||||
The MI300X's 192 GB of HBM per accelerator is particularly notable. While the H100 offers 80 GB, the MI300X's larger capacity means that a single 8-GPU node can hold 1,536 GB of HBM, enough to fit the full weight tensor of our `{python} gpt3_params_b`B model (350 GB in FP16) with substantial room for optimizer state, gradients, and activations. This capacity advantage can reduce the number of nodes needed for training (fewer data-parallel groups means less inter-node communication) or enable larger batch sizes (more activation memory available), both of which improve scaling efficiency.
|
||||
The MI300X's 192 GB of HBM per accelerator is particularly notable. While the H100 offers 80 GB, the MI300X's larger capacity means that a single 8-GPU node can hold 1,536 GB of HBM, enough to fit the full weight tensor of our `{python} frontier_params_b`B model (350 GB in FP16) with substantial room for optimizer state, gradients, and activations. This capacity advantage can reduce the number of nodes needed for training (fewer data-parallel groups means less inter-node communication) or enable larger batch sizes (more activation memory available), both of which improve scaling efficiency.
|
||||
|
||||
Intel's Gaudi architecture represents yet another approach: Gaudi 2 and Gaudi 3 accelerators use an integrated RDMA-capable Ethernet NIC (RoCE) built directly into each accelerator chip, combined with high-bandwidth on-die interconnects. This allows Gaudi to bypass the need for both NVSwitch and separate InfiniBand HCAs, using a single network fabric for both intra-node and inter-node communication. The trade-off is that the intra-node bandwidth is lower than NVLink (roughly 300 GB/s vs. 900 GB/s), limiting the efficiency of tensor parallelism within a node.
|
||||
|
||||
@@ -1837,7 +1837,7 @@ These alternative architectures illustrate a recurring theme in infrastructure d
|
||||
|
||||
Each approach reflects a different bet about which constraint will be most binding for the target workload. NVIDIA bets on bandwidth (NVLink provides the fastest intra-node fabric). Google bets on systolic efficiency (the MXU achieves higher utilization for regular workloads). AMD bets on memory capacity (192 GB HBM reduces the need for complex memory optimization). Intel bets on network simplicity (integrated Ethernet eliminates the need for separate InfiniBand infrastructure).
|
||||
|
||||
For our `{python} gpt3_params_b`B model, the choice matters primarily for the tensor-parallel portion of the training computation, where the accelerator interconnect is on the critical path. The secondary effects (software ecosystem maturity, supply chain availability, total cost of ownership), however, often dominate the primary performance differences in practice.
|
||||
For our `{python} frontier_params_b`B model, the choice matters primarily for the tensor-parallel portion of the training computation, where the accelerator interconnect is on the critical path. The secondary effects (software ecosystem maturity, supply chain availability, total cost of ownership), however, often dominate the primary performance differences in practice.
|
||||
|
||||
#### Node Health and Reliability
|
||||
|
||||
@@ -1849,15 +1849,15 @@ A well-managed fleet tracks the error rates of each component and preemptively m
|
||||
|
||||
Node-level health monitoring is therefore a critical operational practice. Modern fleet management systems continuously collect telemetry from each GPU (temperature, power draw, ECC error counts, NVLink error rates) and from the host BMC (baseboard management controller). Automated health checkers run short diagnostic workloads on idle nodes to verify that all GPUs, NVLinks, and InfiniBand connections are functioning correctly before the job scheduler assigns training work to that node. Without this proactive monitoring, a silently degraded node can corrupt training gradients (if the error is in the arithmetic path) or slow the entire job (if the error causes NVLink retraining, which temporarily reduces bandwidth). @sec-fault-tolerance-reliability discusses fleet-level fault tolerance strategies in detail.
|
||||
|
||||
For our `{python} gpt3_params_b`B model training across 128 nodes, the probability of at least one node experiencing a hardware issue during a two-week training run is substantial. If each node has an MTBF of 1,000 hours (a reasonable estimate for a DGX H100 under sustained load), the expected number of node failures during a 336-hour training run across 128 nodes is $128 \times 336 / 1{,}000 \approx 43$ failures. This means the training run will experience, on average, roughly three node failures per day. Each failure requires detecting the degraded node, draining its workload, substituting a spare node, and resuming from the last checkpoint -- a process that takes 10--30 minutes with automated tooling. Without spare nodes and automated recovery, these failures would extend the training run by 20--30%, consuming millions of dollars in wasted GPU-hours.
|
||||
For our `{python} frontier_params_b`B model training across 128 nodes, the probability of at least one node experiencing a hardware issue during a two-week training run is substantial. If each node has an MTBF of 1,000 hours (a reasonable estimate for a DGX H100 under sustained load), the expected number of node failures during a 336-hour training run across 128 nodes is $128 \times 336 / 1{,}000 \approx 43$ failures. This means the training run will experience, on average, roughly three node failures per day. Each failure requires detecting the degraded node, draining its workload, substituting a spare node, and resuming from the last checkpoint -- a process that takes 10--30 minutes with automated tooling. Without spare nodes and automated recovery, these failures would extend the training run by 20--30%, consuming millions of dollars in wasted GPU-hours.
|
||||
|
||||
### Node Memory Partitioning {#sec-compute-node-memory}
|
||||
|
||||
Understanding how a large model's memory footprint maps onto a node's physical resources is essential for capacity planning. The memory required to train a model far exceeds the weight storage, and different components of the training state reside in different tiers of the node's memory hierarchy.
|
||||
|
||||
For our `{python} gpt3_params_b`B model, the training memory breaks down as follows:
|
||||
For our `{python} frontier_params_b`B model, the training memory breaks down as follows:
|
||||
|
||||
| **Component** | **Precision** | **Memory per Parameter** | **Total for `{python} gpt3_params_b`B Model** |
|
||||
| **Component** | **Precision** | **Memory per Parameter** | **Total for `{python} frontier_params_b`B Model** |
|
||||
|:-------------------------|:-------------:|:------------------------:|:---------------------------------------------:|
|
||||
| **Model Weights** | FP16 | 2 bytes | 350 GB |
|
||||
| **Gradients** | FP16 | 2 bytes | 350 GB |
|
||||
@@ -1866,9 +1866,9 @@ For our `{python} gpt3_params_b`B model, the training memory breaks down as foll
|
||||
| **Activations** | Mixed | Variable | 100--400 GB |
|
||||
| **Total** | | | **2.2--2.5 TB** |
|
||||
|
||||
: **Training Memory Breakdown for a `{python} gpt3_params_b`B-Parameter Model**. The optimizer state alone (first and second moments in FP32) consumes 4$\times$ the memory of the model weights, making optimizer memory the dominant component of training memory. This is why memory optimization techniques focus heavily on optimizer state sharding and offloading. {#tbl-memory-breakdown}
|
||||
: **Training Memory Breakdown for a `{python} frontier_params_b`B-Parameter Model**. The optimizer state alone (first and second moments in FP32) consumes 4$\times$ the memory of the model weights, making optimizer memory the dominant component of training memory. This is why memory optimization techniques focus heavily on optimizer state sharding and offloading. {#tbl-memory-breakdown}
|
||||
|
||||
The model weights in FP16 occupy 350 GB ($`{python} gpt3_params_b` \times 10^9 \times 2$ bytes). The Adam optimizer maintains two FP32 state tensors per parameter (first moment $m$ and second moment $v$), adding $`{python} gpt3_params_b` \times 10^9 \times 4 \times 2 = 1{,}400$ GB. The gradients in FP16 add another 350 GB. Intermediate activations (needed for the backward pass) depend on the batch size and sequence length but typically require 100--400 GB for practical training configurations. The total memory footprint for training is therefore 2.2--2.5 TB, roughly 6--7$\times$ the raw weight size.
|
||||
The model weights in FP16 occupy 350 GB ($`{python} frontier_params_b` \times 10^9 \times 2$ bytes). The Adam optimizer maintains two FP32 state tensors per parameter (first moment $m$ and second moment $v$), adding $`{python} frontier_params_b` \times 10^9 \times 4 \times 2 = 1{,}400$ GB. The gradients in FP16 add another 350 GB. Intermediate activations (needed for the backward pass) depend on the batch size and sequence length but typically require 100--400 GB for practical training configurations. The total memory footprint for training is therefore 2.2--2.5 TB, roughly 6--7$\times$ the raw weight size.
|
||||
|
||||
A DGX H100 node provides `{python} node_hbm_cap` GB of aggregate HBM across its 8 GPUs, plus 2 TB of host DDR5 DRAM. The HBM capacity alone is insufficient for the full training state. To appreciate why, consider the memory arithmetic for different parallelism strategies. With **pure data parallelism**, each GPU must hold the complete model: 350 GB weights + 350 GB gradients + 700 GB optimizer states = 1,400 GB -- physically impossible on an 80 GB device. Even **ZeRO Stage 3**, which shards all three components across $N$ GPUs, yields 1,400/8 = 175 GB per GPU with 8-way sharding, still more than double the available HBM. The solution is to combine **tensor parallelism** (which splits the model's layers across GPUs, reducing per-GPU weights to 350/8 = 43.75 GB) with **ZeRO-1** (which shards only the optimizer state across the data-parallel dimension). In a configuration with TP-8 within the node and 128-way data parallelism across the cluster, each GPU holds 43.75 GB of weight shards plus approximately 700/128 = 5.5 GB of optimizer state, totaling roughly 50 GB -- comfortably within the 80 GB HBM envelope with 30 GB remaining for activations and temporary buffers. This arithmetic is why the combination of tensor parallelism and optimizer sharding has become the standard approach for frontier model training. Several additional strategies further optimize this memory budget:
|
||||
|
||||
@@ -1895,17 +1895,17 @@ The following table summarizes the memory tiers available within a DGX H100 node
|
||||
|
||||
: **Node Memory Hierarchy**. Each tier trades capacity for bandwidth. The training framework's memory manager must orchestrate data flow across these tiers to fit models whose total state exceeds the aggregate HBM capacity. {#tbl-node-memory}
|
||||
|
||||
This orchestration is complex but essential: without it, training our `{python} gpt3_params_b`B model on currently available hardware would be impossible. @sec-distributed-training-systems examines these memory optimization strategies and their interaction with parallelism in full detail.
|
||||
This orchestration is complex but essential: without it, training our `{python} frontier_params_b`B model on currently available hardware would be impossible. @sec-distributed-training-systems examines these memory optimization strategies and their interaction with parallelism in full detail.
|
||||
|
||||
#### Putting It Together: Memory Budget Exercise
|
||||
|
||||
To solidify the memory planning concepts, consider a concrete sizing exercise for our `{python} gpt3_params_b`B model on a DGX H100 node.
|
||||
To solidify the memory planning concepts, consider a concrete sizing exercise for our `{python} frontier_params_b`B model on a DGX H100 node.
|
||||
|
||||
::: {.callout-example title="Memory Budget for 175B Training on DGX H100"}
|
||||
|
||||
**Given**:
|
||||
|
||||
- Model: `{python} gpt3_params_b`B parameters
|
||||
- Model: `{python} frontier_params_b`B parameters
|
||||
- Node: 8$\times$ H100 GPUs, 80 GB HBM each (640 GB total HBM)
|
||||
- Host: 2 TB DDR5
|
||||
- Parallelism: 8-way tensor parallelism within the node
|
||||
@@ -1981,7 +1981,7 @@ The traditional data path routes every byte through the host CPU: storage contro
|
||||
|
||||
The required prefetch depth is not arbitrary but follows from the statistics of I/O latency variance. If the GPU processes one batch every $T_{\text{compute}}$ seconds and the storage system delivers a batch every $T_{io}$ seconds with standard deviation $\sigma_{io}$, the minimum prefetch depth $k$ required to maintain a 99.7% probability of zero stalls is $k \ge \lceil T_{io} / T_{\text{compute}} + 3\sigma_{io} / T_{\text{compute}} \rceil$. For our 175B model training run where $T_{\text{compute}} = 2.0$ seconds and the storage layer delivers batches at $T_{io} = 1.5 \pm 0.5$ seconds, the required depth is $\lceil 0.75 + 0.75 \rceil = 2$ batches. In production environments where "noisy neighbors" on shared file systems induce heavy tail latencies, engineers typically over-provision this buffer to 4--8 batches to insulate the GPU from the erratic physics of distributed storage. The memory cost of this prefetch buffer (each batch may occupy 100--200 MB of host DRAM) is negligible compared to the cost of a GPU stall.
|
||||
|
||||
As cluster sizes expand, a new bottleneck emerges at the storage layer: the **I/O Wall**. When training our `{python} gpt3_params_b`B model across 128 nodes, the system acts as a synchronized "thundering herd" -- all 128 nodes simultaneously demand the next microbatch of tokens at the start of every training step. A shared parallel filesystem like Lustre, even with 500 GB/s of aggregate throughput, will buckle when 128 clients simultaneously pull data, causing read latencies to spike from milliseconds to seconds. The architectural solution is **tiered storage** with aggressive local caching. By provisioning each training node with local NVMe SSDs capable of delivering 25 GB/s per node, the cluster decouples its immediate data dependency from the shared filesystem. A background process prefetches data from the central store to the local NVMe cache asynchronously, smoothing out I/O spikes. For the 128-node cluster, local NVMe creates an aggregate read bandwidth of 3.2 TB/s ($128\times25$ GB/s), eclipsing the capability of even the most expensive centralized storage arrays and ensuring the GPUs are never starved.
|
||||
As cluster sizes expand, a new bottleneck emerges at the storage layer: the **I/O Wall**. When training our `{python} frontier_params_b`B model across 128 nodes, the system acts as a synchronized "thundering herd" -- all 128 nodes simultaneously demand the next microbatch of tokens at the start of every training step. A shared parallel filesystem like Lustre, even with 500 GB/s of aggregate throughput, will buckle when 128 clients simultaneously pull data, causing read latencies to spike from milliseconds to seconds. The architectural solution is **tiered storage** with aggressive local caching. By provisioning each training node with local NVMe SSDs capable of delivering 25 GB/s per node, the cluster decouples its immediate data dependency from the shared filesystem. A background process prefetches data from the central store to the local NVMe cache asynchronously, smoothing out I/O spikes. For the 128-node cluster, local NVMe creates an aggregate read bandwidth of 3.2 TB/s ($128\times25$ GB/s), eclipsing the capability of even the most expensive centralized storage arrays and ensuring the GPUs are never starved.
|
||||
|
||||
::: {.callout-warning title="The Data Loading Trap"}
|
||||
|
||||
@@ -1989,7 +1989,7 @@ A subtle failure mode occurs when data loading *appears* fast in benchmarks but
|
||||
|
||||
:::
|
||||
|
||||
For our `{python} gpt3_params_b`B model, a single DGX H100 node provides `{python} node_hbm_cap` GB of aggregate HBM, enough to hold the model weights and a portion of the optimizer state. Training, however, requires processing trillions of tokens, and a single node's compute throughput limits training time to months. To complete training in a reasonable timeframe (weeks rather than months), we need tens or hundreds of nodes. Stacking those nodes into a physical enclosure brings us to the next level of infrastructure, where the constraints shift from bandwidth and capacity to raw power and heat.
|
||||
For our `{python} frontier_params_b`B model, a single DGX H100 node provides `{python} node_hbm_cap` GB of aggregate HBM, enough to hold the model weights and a portion of the optimizer state. Training, however, requires processing trillions of tokens, and a single node's compute throughput limits training time to months. To complete training in a reasonable timeframe (weeks rather than months), we need tens or hundreds of nodes. Stacking those nodes into a physical enclosure brings us to the next level of infrastructure, where the constraints shift from bandwidth and capacity to raw power and heat.
|
||||
|
||||
## The Rack {#sec-compute-rack}
|
||||
|
||||
@@ -1999,7 +1999,7 @@ For our `{python} gpt3_params_b`B model, a single DGX H100 node provides `{pytho
|
||||
|
||||
A standard 42U server rack in a traditional datacenter draws 5--10 kW and can be cooled by room-temperature air pushed through perforated floor tiles. Now place four DGX H100 nodes in that same rack: 32 GPUs, each drawing 700 W, plus host CPUs, memory, networking, and power conversion losses. The rack power reaches `{python} rack_power_str` kW, an order of magnitude beyond what traditional datacenter infrastructure was designed to deliver or cool. At this density, the engineering constraints shift from silicon and signal integrity to power delivery and thermodynamics. The rack is where the Power Wall and the laws of heat transfer become the dominant design forces.
|
||||
|
||||
For our `{python} gpt3_params_b`B model, training across 1,024 GPUs requires approximately 128 racks. Each rack dissipates `{python} rack_power_str` kW as heat -- the thermal output of a small industrial furnace. The aggregate power draw of the training cluster approaches 5 MW, enough to power a small town. Delivering this power reliably, converting it efficiently, and removing the resulting heat without allowing any component to exceed its thermal limit is a multi-disciplinary engineering challenge that spans electrical, mechanical, and civil engineering. A failure at any point in the power delivery chain -- from the utility substation to the individual GPU voltage regulator -- can halt the entire training run, wasting hours of computation and potentially corrupting the training state.
|
||||
For our `{python} frontier_params_b`B model, training across 1,024 GPUs requires approximately 128 racks. Each rack dissipates `{python} rack_power_str` kW as heat -- the thermal output of a small industrial furnace. The aggregate power draw of the training cluster approaches 5 MW, enough to power a small town. Delivering this power reliably, converting it efficiently, and removing the resulting heat without allowing any component to exceed its thermal limit is a multi-disciplinary engineering challenge that spans electrical, mechanical, and civil engineering. A failure at any point in the power delivery chain -- from the utility substation to the individual GPU voltage regulator -- can halt the entire training run, wasting hours of computation and potentially corrupting the training state.
|
||||
|
||||
::: {.callout-definition title="Rack"}
|
||||
|
||||
@@ -2086,7 +2086,7 @@ This energy growth rate cannot continue indefinitely without addressing the sour
|
||||
|
||||
Some have invested directly in new renewable energy projects, adding generation capacity to the grid specifically to power their ML workloads. Microsoft, for example, has signed agreements to purchase nuclear energy from restarted reactors, recognizing that the scale and consistency of ML training loads require baseload power sources that renewable intermittent sources alone cannot provide. Google has similarly invested in geothermal energy projects, which provide consistent power output independent of weather conditions.
|
||||
|
||||
The **carbon intensity** of the energy grid dictates the true environmental cost of a training run. A facility powered by hydroelectric dams in the Pacific Northwest emits approximately 50g CO$_2$/kWh, while a coal-heavy grid can produce 400g CO$_2$/kWh or more. A single training run for our `{python} gpt3_params_b`B model, consuming approximately 1,287 MWh, implies a carbon impact ranging from 64 tonnes to 515 tonnes depending solely on location -- an 8$\times$ variance that makes site selection the single most effective tool for decarbonization. This environmental calculus increasingly drives infrastructure decisions: organizations that can locate training clusters in low-carbon regions achieve both lower electricity costs (hydroelectric power is typically cheaper than fossil-fuel generation) and lower carbon footprints, a rare alignment of economic and environmental incentives. @sec-sustainable-ai examines these sustainability considerations in detail, including the carbon accounting for different energy sources and strategies for reducing the environmental footprint of ML infrastructure.
|
||||
The **carbon intensity** of the energy grid dictates the true environmental cost of a training run. A facility powered by hydroelectric dams in the Pacific Northwest emits approximately 50g CO$_2$/kWh, while a coal-heavy grid can produce 400g CO$_2$/kWh or more. A single training run for our `{python} frontier_params_b`B model, consuming approximately 1,287 MWh, implies a carbon impact ranging from 64 tonnes to 515 tonnes depending solely on location -- an 8$\times$ variance that makes site selection the single most effective tool for decarbonization. This environmental calculus increasingly drives infrastructure decisions: organizations that can locate training clusters in low-carbon regions achieve both lower electricity costs (hydroelectric power is typically cheaper than fossil-fuel generation) and lower carbon footprints, a rare alignment of economic and environmental incentives. @sec-sustainable-ai examines these sustainability considerations in detail, including the carbon accounting for different energy sources and strategies for reducing the environmental footprint of ML infrastructure.
|
||||
|
||||
### Cooling {#sec-compute-cooling}
|
||||
|
||||
@@ -2148,7 +2148,7 @@ The operational procedures for immersion-cooled facilities differ fundamentally
|
||||
|
||||
: **The Shift to Liquid Cooling**. At rack power densities above 30 kW, air cooling requires fan power that approaches the power consumed by the GPUs themselves. Liquid cooling is not a premium option; it is a thermodynamic requirement for modern ML racks. {#tbl-cooling-limits}
|
||||
|
||||
The capital cost of these cooling technologies spans an order of magnitude. Standard air cooling infrastructure costs \$2,000--5,000 per rack (fans, CRAC units, raised floor tiles). Direct-to-chip liquid cooling costs \$15,000--25,000 per rack (cold plates, manifolds, CDUs, piping). Full immersion cooling costs \$30,000--50,000 per tank (dielectric fluid, sealed tanks, specialized heat exchangers). The break-even analysis between air and liquid cooling depends on rack power density: at 20 kW per rack, air cooling's lower CapEx wins over a 3-year lifecycle. At 40 kW per rack, the electricity savings from liquid cooling's lower PUE (1.08 vs. 1.5) offset the higher CapEx within 18--24 months. At 60+ kW per rack -- the regime of modern ML infrastructure -- air cooling is physically impossible, making the comparison moot. For our `{python} gpt3_params_b`B model's 128-rack training cluster at `{python} rack_power_str` kW per rack, direct-to-chip liquid cooling is the standard choice, balancing density, serviceability, and cost. Immersion cooling offers marginal PUE improvement (1.03 vs. 1.08) but introduces operational complexity that most organizations find unjustified at current rack densities.
|
||||
The capital cost of these cooling technologies spans an order of magnitude. Standard air cooling infrastructure costs \$2,000--5,000 per rack (fans, CRAC units, raised floor tiles). Direct-to-chip liquid cooling costs \$15,000--25,000 per rack (cold plates, manifolds, CDUs, piping). Full immersion cooling costs \$30,000--50,000 per tank (dielectric fluid, sealed tanks, specialized heat exchangers). The break-even analysis between air and liquid cooling depends on rack power density: at 20 kW per rack, air cooling's lower CapEx wins over a 3-year lifecycle. At 40 kW per rack, the electricity savings from liquid cooling's lower PUE (1.08 vs. 1.5) offset the higher CapEx within 18--24 months. At 60+ kW per rack -- the regime of modern ML infrastructure -- air cooling is physically impossible, making the comparison moot. For our `{python} frontier_params_b`B model's 128-rack training cluster at `{python} rack_power_str` kW per rack, direct-to-chip liquid cooling is the standard choice, balancing density, serviceability, and cost. Immersion cooling offers marginal PUE improvement (1.03 vs. 1.08) but introduces operational complexity that most organizations find unjustified at current rack densities.
|
||||
|
||||
::: {.callout-notebook title="The Cooling Tax"}
|
||||
|
||||
@@ -2174,7 +2174,7 @@ The viability of waste heat reuse depends on the proximity of heat consumers. Ur
|
||||
|
||||
@sec-sustainable-ai examines the environmental implications of datacenter cooling in detail, including the carbon accounting for waste heat reuse and the life-cycle analysis of different cooling technologies.
|
||||
|
||||
For our `{python} gpt3_params_b`B model training cluster, the choice between cooling technologies is not optional. A cluster of 1,000 H100s dissipates 700 kW of heat from the GPUs alone, before accounting for CPUs, memory, networking, and power conversion losses. Only liquid cooling can remove this heat at the required density. The rack is the level at which the problem shifts from *computation* to *physics*, and the design of the cooling infrastructure often determines whether a training cluster can operate at full utilization or must be throttled to prevent thermal runaway.
|
||||
For our `{python} frontier_params_b`B model training cluster, the choice between cooling technologies is not optional. A cluster of 1,000 H100s dissipates 700 kW of heat from the GPUs alone, before accounting for CPUs, memory, networking, and power conversion losses. Only liquid cooling can remove this heat at the required density. The rack is the level at which the problem shifts from *computation* to *physics*, and the design of the cooling infrastructure often determines whether a training cluster can operate at full utilization or must be throttled to prevent thermal runaway.
|
||||
|
||||
#### Cooling System Reliability
|
||||
|
||||
@@ -2231,14 +2231,14 @@ A team is planning to deploy 256 H100 GPUs (32 nodes) in an existing air-cooled
|
||||
|
||||
:::
|
||||
|
||||
The rack concentrates power and heat into a physical volume where thermodynamics, not software, sets the limits. A single rack of 32 GPUs, however, is far from sufficient for our `{python} gpt3_params_b`B model, which may require thousands of accelerators to train in a reasonable timeframe. The next level of infrastructure aggregates hundreds of racks into a unified computing system: the pod.
|
||||
The rack concentrates power and heat into a physical volume where thermodynamics, not software, sets the limits. A single rack of 32 GPUs, however, is far from sufficient for our `{python} frontier_params_b`B model, which may require thousands of accelerators to train in a reasonable timeframe. The next level of infrastructure aggregates hundreds of racks into a unified computing system: the pod.
|
||||
|
||||
## The Pod {#sec-compute-pod}
|
||||
|
||||
\index{Pod}
|
||||
\index{Warehouse-Scale Computer}
|
||||
|
||||
Training our `{python} gpt3_params_b`B model on a single DGX H100 node (8 GPUs, roughly 16,000 TFLOPS aggregate) would take several months, assuming we can fit the model at all. Reducing training time to weeks requires 100--1,000 nodes operating in concert, and doing so demands that these nodes be wired together into a network fast enough to keep gradient synchronization from becoming the bottleneck. This is the engineering challenge of the **pod**: aggregating hundreds of racks into a single, coordinated computing system where the network fabric serves the same role that the system bus serves within a single machine.
|
||||
Training our `{python} frontier_params_b`B model on a single DGX H100 node (8 GPUs, roughly 16,000 TFLOPS aggregate) would take several months, assuming we can fit the model at all. Reducing training time to weeks requires 100--1,000 nodes operating in concert, and doing so demands that these nodes be wired together into a network fast enough to keep gradient synchronization from becoming the bottleneck. This is the engineering challenge of the **pod**: aggregating hundreds of racks into a single, coordinated computing system where the network fabric serves the same role that the system bus serves within a single machine.
|
||||
|
||||
The scale of this challenge is worth appreciating concretely. A 1,024-node DGX H100 cluster contains 8,192 GPUs, 4,096 NVSwitch chips, 8,192 InfiniBand HCAs, several hundred InfiniBand switches, tens of thousands of cables, and consumes approximately 7--10 MW of power. It occupies roughly 250 racks across one or more datacenter halls.
|
||||
|
||||
@@ -2246,7 +2246,7 @@ The physical weight of such a cluster is also substantial. Each DGX H100 node we
|
||||
|
||||
The physical layout of the datacenter hall reflects these density constraints. The extreme power density of ML training racks necessitates rigid hot aisle/cold aisle containment in air-cooled sections, where cold air is forced into the enclosed front of the rack and waste heat is captured immediately at the rear exhaust. The "spine" of the hall -- the central cable corridor connecting all racks to the aggregation switches -- must accommodate thousands of fiber optic cables and power feeds. Overhead cable trays are preferred over under-floor routing to improve airflow and accessibility, carrying the heavy copper power feeds and fragile fiber interconnects that form the nervous system of the cluster. The layout must also accommodate the liquid cooling infrastructure: CDU placement, coolant piping runs, and isolation valves that allow individual racks to be serviced without draining the entire cooling loop. These physical layout decisions, made during facility design, constrain the network topology options available to the training team years later.
|
||||
|
||||
Training a `{python} gpt3_params_b`B model on this cluster requires `{python} pod_flops_math` total floating-point operations (assuming 300 billion training tokens). At 45 percent MFU -- a strong but achievable figure for well-tuned large-model training -- the cluster sustains `{python} cluster_throughput_math` EFLOPS, yielding a physics-limit training time of approximately 12 hours. This is the absolute floor: every GPU computing useful FLOPs at 45 percent of peak, with zero communication overhead, zero pipeline bubbles, and zero downtime.
|
||||
Training a `{python} frontier_params_b`B model on this cluster requires `{python} pod_flops_math` total floating-point operations (assuming 300 billion training tokens). At 45 percent MFU -- a strong but achievable figure for well-tuned large-model training -- the cluster sustains `{python} cluster_throughput_math` EFLOPS, yielding a physics-limit training time of approximately 12 hours. This is the absolute floor: every GPU computing useful FLOPs at 45 percent of peak, with zero communication overhead, zero pipeline bubbles, and zero downtime.
|
||||
|
||||
In practice, communication overhead, pipeline bubbles, checkpoint I/O, hardware failures, and maintenance windows compound to extend the actual wall-clock time to 2--4 weeks. The gap between the 12-hour physics limit and the practical 2--4 week schedule is a factor of 15--25$\times$ (as the callout below quantifies step by step), illustrating the central theme of this section: at pod scale, the infrastructure's imperfections dominate over the raw capability of the silicon. Recovering this lost factor is the central challenge of distributed systems engineering, and the solutions span hardware (better networks), software (overlapped communication), and operations (proactive maintenance to minimize downtime).
|
||||
|
||||
@@ -2263,7 +2263,7 @@ In practice, communication overhead, pipeline bubbles, checkpoint I/O, hardware
|
||||
# │ Show: ~12h physics limit → ~18h system minimum → ~2 weeks operations.
|
||||
# │ How: 6NP / (N_gpus * peak * util); apply overhead multipliers sequentially.
|
||||
# │
|
||||
# │ Imports: mlsys.constants (GPT3_PARAMS, H100_FLOPS_FP16_TENSOR, ...)
|
||||
# │ Imports: mlsys.constants (Models.GPT3.parameters, system.peak_flops, ...)
|
||||
# │ Exports: t_physics_hr_str, t_comm_hr_str, t_bubble_hr_str, t_ckpt_hr_str,
|
||||
# │ t_fail_hr_str, t_maint_hr_str, t_total_week_str
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
@@ -2275,10 +2275,10 @@ class TrainingTime175B:
|
||||
Quantifies the gap between silicon capability and fleet productivity.
|
||||
"""
|
||||
# ┌── 1. LOAD (Constants) ───────────────────────────────────────────────
|
||||
n_params = GPT3_PARAMS.m_as('param')
|
||||
n_params = Models.GPT3.parameters.m_as('param')
|
||||
n_tokens = 300 * BILLION
|
||||
n_gpus = 8192
|
||||
peak_flops = H100_FLOPS_FP16_TENSOR.m_as(flop/second)
|
||||
peak_flops = system.peak_flops.m_as(flop/second)
|
||||
mfu = 0.45
|
||||
|
||||
# Multipliers
|
||||
@@ -2327,7 +2327,7 @@ t_total_week_str = TrainingTime175B.t_total_week_str
|
||||
|
||||
::: {.callout-notebook title="Training Time for 175B"}
|
||||
|
||||
We can derive the training time for our `{python} gpt3_params_b`B model from first principles.
|
||||
We can derive the training time for our `{python} frontier_params_b`B model from first principles.
|
||||
|
||||
1. **Total FLOPs**: Using the approximation $6 \times N \times P$, where `{python} n_params_math` and $P = 300 \times 10^9$ tokens:
|
||||
`{python} total_flops_math` FLOPs
|
||||
@@ -2379,7 +2379,7 @@ This perspective shift has practical implications for how infrastructure teams t
|
||||
|
||||
The first principle is that **the network is the bottleneck, not the compute**. Within a single node, NVLink provides enough bandwidth for tensor parallelism. Across nodes, the network fabric must carry gradient tensors, activation checkpoints, and pipeline stage outputs.
|
||||
|
||||
For our `{python} gpt3_params_b`B model with data parallelism across 128 nodes, each training step requires an AllReduce of approximately 350 GB of gradients. If the network *cannot* overlap this communication with the next forward pass, the GPUs sit idle during synchronization.
|
||||
For our `{python} frontier_params_b`B model with data parallelism across 128 nodes, each training step requires an AllReduce of approximately 350 GB of gradients. If the network *cannot* overlap this communication with the next forward pass, the GPUs sit idle during synchronization.
|
||||
|
||||
The fraction of time spent on communication (as opposed to computation) is the single largest determinant of cluster efficiency, often mattering more than the peak FLOPS of the individual accelerators. This is a counterintuitive result for teams accustomed to thinking about hardware in terms of compute specifications. A cluster with slightly slower GPUs but a better network fabric can outperform a cluster with faster GPUs but an inadequate network, because the faster GPUs spend more of their time idle, waiting for gradient synchronization.
|
||||
|
||||
@@ -2389,7 +2389,7 @@ The second principle is that **failure is routine, not exceptional**. A cluster
|
||||
|
||||
\index{MTBF}
|
||||
|
||||
To put this in perspective: if training our `{python} gpt3_params_b`B model takes 2 weeks (336 hours) on a 10,000-GPU cluster, we should expect approximately 336 individual GPU failures during the training run. Each failure potentially corrupts the training state and requires either rolling back to a checkpoint or reorganizing the cluster to work around the failed node.
|
||||
To put this in perspective: if training our `{python} frontier_params_b`B model takes 2 weeks (336 hours) on a 10,000-GPU cluster, we should expect approximately 336 individual GPU failures during the training run. Each failure potentially corrupts the training state and requires either rolling back to a checkpoint or reorganizing the cluster to work around the failed node.
|
||||
|
||||
The checkpointing frequency determines the cost of each failure. If checkpoints are taken every 10 minutes, a failure costs at most 10 minutes of lost work plus the time to restart. If checkpoints are taken every hour, each failure costs up to an hour of lost work plus restart overhead, and with 336 failures, the cumulative lost time could exceed 336 hours, *doubling the training time*.
|
||||
|
||||
@@ -2399,7 +2399,7 @@ This requires redundancy at every level of the physical infrastructure. Power fe
|
||||
|
||||
The interaction between failure rate and checkpointing frequency creates an optimization problem. More frequent checkpoints reduce the amount of work lost per failure but consume GPU time (to serialize the training state) and storage bandwidth (to write the checkpoint). Less frequent checkpoints reduce overhead but increase the expected work lost per failure. @sec-fault-tolerance-reliability examines checkpointing strategies and elastic training systems in detail.
|
||||
|
||||
Checkpointing also creates a significant *storage* requirement that demands its own dedicated infrastructure. A full checkpoint for our `{python} gpt3_params_b`B model includes the model weights (350 GB), optimizer states (1,400 GB), and the random number generator states needed to resume training deterministically. The total is approximately 1.75 TB per checkpoint. To keep the checkpoint overhead below 2% of total training time, this write operation must complete in under 30 seconds, demanding a sustained aggregate write bandwidth of approximately 60 GB/s. Standard NFS or object storage implementations collapse under this "thundering herd" pattern, where all GPUs transition from compute to I/O simultaneously. The solution requires a high-performance parallel file system like Lustre or IBM Spectrum Scale (GPFS), architected with hundreds of Object Storage Targets (OSTs) striping data across thousands of NVMe drives. A failure to architect for this burst bandwidth results in "I/O jitter," where training hangs unpredictably for minutes as the checkpoint write saturates the storage backend, effectively burning millions of dollars in idle GPU cycles. @sec-data-storage examines the storage architectures that meet these throughput requirements in detail. If checkpoints are taken every 10 minutes during a 2-week training run, the system must write:
|
||||
Checkpointing also creates a significant *storage* requirement that demands its own dedicated infrastructure. A full checkpoint for our `{python} frontier_params_b`B model includes the model weights (350 GB), optimizer states (1,400 GB), and the random number generator states needed to resume training deterministically. The total is approximately 1.75 TB per checkpoint. To keep the checkpoint overhead below 2% of total training time, this write operation must complete in under 30 seconds, demanding a sustained aggregate write bandwidth of approximately 60 GB/s. Standard NFS or object storage implementations collapse under this "thundering herd" pattern, where all GPUs transition from compute to I/O simultaneously. The solution requires a high-performance parallel file system like Lustre or IBM Spectrum Scale (GPFS), architected with hundreds of Object Storage Targets (OSTs) striping data across thousands of NVMe drives. A failure to architect for this burst bandwidth results in "I/O jitter," where training hangs unpredictably for minutes as the checkpoint write saturates the storage backend, effectively burning millions of dollars in idle GPU cycles. @sec-data-storage examines the storage architectures that meet these throughput requirements in detail. If checkpoints are taken every 10 minutes during a 2-week training run, the system must write:
|
||||
|
||||
- **Checkpoints per run**: (14 days $\times$ 24 hours $\times$ 6 per hour) = 2,016 checkpoints
|
||||
- **Total checkpoint data**: $2{,}016 \times 1.75$ TB = 3,528 TB
|
||||
@@ -2435,7 +2435,7 @@ Network topology optimization also benefits from homogeneity. If every node runs
|
||||
|
||||
The cost of this homogeneity is inflexibility. A pod designed for Transformer training with 8-way tensor parallelism may be poorly suited for Mixture-of-Experts models that require AllToAll communication, or for recommendation models that need large CPU memory pools for embedding tables. Organizations that run diverse workloads must either accept the generality tax of a flexible topology or maintain multiple specialized pods, each optimized for a different workload class. The choice mirrors the accelerator spectrum: specialization yields efficiency, generality yields flexibility, and the optimal point depends on workload stability.
|
||||
|
||||
The software infrastructure of a WSC functions less like a collection of servers and more like a single distributed operating system, with the **job scheduler** acting as the kernel. Systems like Slurm or Kubernetes do not merely assign tasks; they orchestrate the rigid resource geometry required for distributed training. The critical constraint is **gang scheduling**: a training job for our `{python} gpt3_params_b`B model requiring 1,024 GPUs cannot start if only 1,023 are available. The scheduler must allocate the entire cohort simultaneously, because the synchronous nature of the training algorithm means a single missing node halts the progress of the entire fleet. Beyond binary allocation, the scheduler must enforce **topology awareness**, placing the job on a contiguous block of high-speed interconnects to minimize cross-sectional bandwidth bottlenecks. Allocating 1,024 GPUs scattered randomly across the datacenter introduces hop-count penalties that can reduce effective AllReduce bandwidth by 30--50%. In a multi-tenant environment, this creates complex queue management challenges where high-priority jobs may need to preempt lower-priority experiments, triggering automated checkpointing and eviction to free up the required contiguous hardware blocks. @sec-fleet-orchestration examines the scheduling algorithms and resource management strategies for WSC-scale clusters in detail.
|
||||
The software infrastructure of a WSC functions less like a collection of servers and more like a single distributed operating system, with the **job scheduler** acting as the kernel. Systems like Slurm or Kubernetes do not merely assign tasks; they orchestrate the rigid resource geometry required for distributed training. The critical constraint is **gang scheduling**: a training job for our `{python} frontier_params_b`B model requiring 1,024 GPUs cannot start if only 1,023 are available. The scheduler must allocate the entire cohort simultaneously, because the synchronous nature of the training algorithm means a single missing node halts the progress of the entire fleet. Beyond binary allocation, the scheduler must enforce **topology awareness**, placing the job on a contiguous block of high-speed interconnects to minimize cross-sectional bandwidth bottlenecks. Allocating 1,024 GPUs scattered randomly across the datacenter introduces hop-count penalties that can reduce effective AllReduce bandwidth by 30--50%. In a multi-tenant environment, this creates complex queue management challenges where high-priority jobs may need to preempt lower-priority experiments, triggering automated checkpointing and eviction to free up the required contiguous hardware blocks. @sec-fleet-orchestration examines the scheduling algorithms and resource management strategies for WSC-scale clusters in detail.
|
||||
|
||||
### Scaling Efficiency {#sec-compute-scaling-efficiency}
|
||||
|
||||
@@ -2455,7 +2455,7 @@ where $T_N$ is the training time on $N$ nodes. An efficiency of 1.0 means perfec
|
||||
|
||||
:::
|
||||
|
||||
For data-parallel training of our `{python} gpt3_params_b`B model, the communication cost per step is dominated by the AllReduce of 350 GB of gradients. Using ring-AllReduce over InfiniBand at `{python} ib_bw_gbs` GB/s effective bandwidth, the communication time is approximately `{python} comm_time_formula_math`, which for large $N$ approaches `{python} comm_approx_math` seconds. If the compute time per step is 20 seconds, the total step time is 34 seconds, yielding a scaling efficiency of $20/34 \approx 0.59$. In practice, communication can be overlapped with the backward pass (sending gradients for early layers while computing gradients for later layers), which recovers much of this loss, achieving 70--90% scaling efficiency for well-optimized training systems.
|
||||
For data-parallel training of our `{python} frontier_params_b`B model, the communication cost per step is dominated by the AllReduce of 350 GB of gradients. Using ring-AllReduce over InfiniBand at `{python} ib_bw_gbs` GB/s effective bandwidth, the communication time is approximately `{python} comm_time_formula_math`, which for large $N$ approaches `{python} comm_approx_math` seconds. If the compute time per step is 20 seconds, the total step time is 34 seconds, yielding a scaling efficiency of $20/34 \approx 0.59$. In practice, communication can be overlapped with the backward pass (sending gradients for early layers while computing gradients for later layers), which recovers much of this loss, achieving 70--90% scaling efficiency for well-optimized training systems.
|
||||
|
||||
The scaling efficiency depends critically on the ratio of computation to communication. This ratio is determined by three factors:
|
||||
|
||||
@@ -2467,11 +2467,11 @@ The scaling efficiency depends critically on the ratio of computation to communi
|
||||
|
||||
These three factors interact in important ways. Larger models with larger batch sizes achieve better scaling efficiency, which means that frontier-scale training runs (the most expensive workloads) are also the ones that benefit most from scale. This creates a virtuous cycle for large-scale infrastructure: the workloads that justify building thousand-GPU clusters are also the workloads that use them most efficiently. Conversely, small models and small batch sizes scale poorly, which is why researchers training 1B-parameter models on 64 GPUs often achieve only 40--60% scaling efficiency.
|
||||
|
||||
However, even for large models, scaling does not continue indefinitely. There exists a **scaling cliff** beyond which adding more GPUs actually reduces cost-efficiency. For our `{python} gpt3_params_b`B model, the optimal cluster size is approximately 1,024--4,096 GPUs, where the communication-to-compute ratio remains favorable and scaling efficiency stays above 70%. Beyond 8,192 GPUs, the AllReduce communication time begins to dominate the backward pass computation time, and the efficiency drops below 50%. While the wall-clock training time may still decrease slightly with more GPUs, the *cost per useful FLOP* increases -- the organization is paying for 8,000 GPUs to do the work of 4,000. This non-linear relationship dictates that the economic viability of training frontier models is bounded not just by hardware availability but by the physics of interconnect latency. The cluster must be sized to operate in the linear regime of the scaling curve, and the model architecture (batch size, sequence length, parallelism dimensions) must be co-designed with the cluster size to maintain this balance.
|
||||
However, even for large models, scaling does not continue indefinitely. There exists a **scaling cliff** beyond which adding more GPUs actually reduces cost-efficiency. For our `{python} frontier_params_b`B model, the optimal cluster size is approximately 1,024--4,096 GPUs, where the communication-to-compute ratio remains favorable and scaling efficiency stays above 70%. Beyond 8,192 GPUs, the AllReduce communication time begins to dominate the backward pass computation time, and the efficiency drops below 50%. While the wall-clock training time may still decrease slightly with more GPUs, the *cost per useful FLOP* increases -- the organization is paying for 8,000 GPUs to do the work of 4,000. This non-linear relationship dictates that the economic viability of training frontier models is bounded not just by hardware availability but by the physics of interconnect latency. The cluster must be sized to operate in the linear regime of the scaling curve, and the model architecture (batch size, sequence length, parallelism dimensions) must be co-designed with the cluster size to maintain this balance.
|
||||
|
||||
::: {.callout-notebook title="Scaling Efficiency for a 175B Model"}
|
||||
|
||||
**Setup**: Training a `{python} gpt3_params_b`B model on a DGX H100 cluster with `{python} ib_bw_str` Gbps InfiniBand per GPU.
|
||||
**Setup**: Training a `{python} frontier_params_b`B model on a DGX H100 cluster with `{python} ib_bw_str` Gbps InfiniBand per GPU.
|
||||
|
||||
- **Compute per step** (assuming batch size 2M tokens, 6 FLOPs per parameter per token):
|
||||
`{python} compute_per_step_math` FLOPs
|
||||
@@ -2494,7 +2494,7 @@ This low efficiency (37.5%) shows why naive data parallelism at this scale is in
|
||||
|
||||
The scaling efficiency analysis reveals a deeper insight: the optimal parallelism strategy is not determined by the model architecture alone but by the *interaction* between the model's communication requirements and the infrastructure's bandwidth hierarchy. Each combination of parallelism strategy and infrastructure topology produces a different scaling efficiency curve, and selecting the wrong combination can waste a significant fraction of the cluster's capacity.
|
||||
|
||||
To illustrate this interaction concretely, consider three parallelism configurations for training our `{python} gpt3_params_b`B model on a 1,024-GPU cluster organized as 128 nodes of 8 GPUs each.
|
||||
To illustrate this interaction concretely, consider three parallelism configurations for training our `{python} frontier_params_b`B model on a 1,024-GPU cluster organized as 128 nodes of 8 GPUs each.
|
||||
|
||||
Configuration A: Pure Data Parallelism (DP-1024). All 1,024 GPUs replicate the full model (using ZeRO to shard optimizer states), and each GPU processes a different data shard. The gradient AllReduce exchanges 350 GB across the full InfiniBand fabric. As the napkin math above showed, this achieves approximately 37.5% efficiency because the inter-node communication dominates.
|
||||
|
||||
@@ -2545,11 +2545,11 @@ Fleet operators have learned, often through costly experience, that proactive ma
|
||||
|
||||
:::
|
||||
|
||||
In multi-tenant clusters where multiple training jobs share the same physical infrastructure, the **noisy neighbor** problem introduces a performance hazard that is invisible to individual job metrics. While containerization strictly limits CPU and memory usage, the network fabric is often a shared resource susceptible to interference. If Job A initiates a massive AllReduce operation across the spine switches just as Job B attempts to fetch training data from networked storage, the resulting micro-bursts of packet contention can throttle Job B's throughput by 30--40%. This interference is particularly pernicious in RDMA-enabled clusters where traffic bypasses the host CPU, rendering standard OS-level packet scheduling ineffective. Modern orchestration mitigates this via **static rail alignment** -- physically dedicating specific InfiniBand subnets to specific jobs -- or by deploying congestion notification protocols that throttle aggressive flows at the switch hardware level. For organizations running our `{python} gpt3_params_b`B model training alongside smaller research experiments, the safest approach is to physically partition the cluster into isolated "islands" with dedicated network fabrics, accepting the utilization penalty of fragmentation in exchange for performance predictability.
|
||||
In multi-tenant clusters where multiple training jobs share the same physical infrastructure, the **noisy neighbor** problem introduces a performance hazard that is invisible to individual job metrics. While containerization strictly limits CPU and memory usage, the network fabric is often a shared resource susceptible to interference. If Job A initiates a massive AllReduce operation across the spine switches just as Job B attempts to fetch training data from networked storage, the resulting micro-bursts of packet contention can throttle Job B's throughput by 30--40%. This interference is particularly pernicious in RDMA-enabled clusters where traffic bypasses the host CPU, rendering standard OS-level packet scheduling ineffective. Modern orchestration mitigates this via **static rail alignment** -- physically dedicating specific InfiniBand subnets to specific jobs -- or by deploying congestion notification protocols that throttle aggressive flows at the switch hardware level. For organizations running our `{python} frontier_params_b`B model training alongside smaller research experiments, the safest approach is to physically partition the cluster into isolated "islands" with dedicated network fabrics, accepting the utilization penalty of fragmentation in exchange for performance predictability.
|
||||
|
||||
The reliability challenge is formalized by the **checkpoint-compute trade-off**, which balances the time lost to saving state against the time lost to recomputing work after a failure. The **Young/Daly formula** defines the optimal checkpoint interval $\tau$ as $\sqrt{2\delta M}$, where $\delta$ is the time to write a checkpoint and $M$ is the Mean Time Between Failures. For our `{python} gpt3_params_b`B model, the full training state -- weights, optimizer moments, and gradients -- approaches 2 TB. Even with a high-performance parallel filesystem capable of 100 GB/s write throughput, committing this state to persistent storage takes approximately $\delta \approx 20$ seconds. In a cluster of 10,000 GPUs where the system-wide MTBF is approximately 1 hour, the formula yields an optimal checkpoint interval of $\sqrt{2 \times 20 \times 3600} \approx 380$ seconds, or roughly 6.3 minutes. This creates a relentless cadence: every 6.3 minutes, the entire cluster pauses for 20 seconds to serialize its state to disk, introducing a sustained training overhead of approximately 5%. If the interval is too long, the cost of recomputing lost work after a failure exceeds the savings from fewer checkpoints; if too short, the I/O overhead dominates the training budget. @sec-fault-tolerance-reliability examines checkpointing strategies, including asynchronous checkpointing and incremental state saving, that reduce this overhead.
|
||||
The reliability challenge is formalized by the **checkpoint-compute trade-off**, which balances the time lost to saving state against the time lost to recomputing work after a failure. The **Young/Daly formula** defines the optimal checkpoint interval $\tau$ as $\sqrt{2\delta M}$, where $\delta$ is the time to write a checkpoint and $M$ is the Mean Time Between Failures. For our `{python} frontier_params_b`B model, the full training state -- weights, optimizer moments, and gradients -- approaches 2 TB. Even with a high-performance parallel filesystem capable of 100 GB/s write throughput, committing this state to persistent storage takes approximately $\delta \approx 20$ seconds. In a cluster of 10,000 GPUs where the system-wide MTBF is approximately 1 hour, the formula yields an optimal checkpoint interval of $\sqrt{2 \times 20 \times 3600} \approx 380$ seconds, or roughly 6.3 minutes. This creates a relentless cadence: every 6.3 minutes, the entire cluster pauses for 20 seconds to serialize its state to disk, introducing a sustained training overhead of approximately 5%. If the interval is too long, the cost of recomputing lost work after a failure exceeds the savings from fewer checkpoints; if too short, the I/O overhead dominates the training budget. @sec-fault-tolerance-reliability examines checkpointing strategies, including asynchronous checkpointing and incremental state saving, that reduce this overhead.
|
||||
|
||||
The choice of network topology is a direct reflection of the dominant communication patterns in the distributed training workload. A **fat-tree** topology offers full bisection bandwidth and non-blocking any-to-any communication, making it theoretically ideal for algorithms like AllReduce and AllToAll. However, the cost of the upper-tier spine switches makes it expensive at scale. A **torus or mesh** topology provides excellent nearest-neighbor bandwidth at a fraction of the switch cost but suffers severe congestion penalties for global traffic patterns. A **rail-optimized** topology physically wires the network so that each GPU rank connects to a dedicated leaf switch, maximizing the efficiency of structured AllReduce within parallelism groups. For our `{python} gpt3_params_b`B model configured with TP-8, PP-4, and DP-32, the rail-optimized topology is the natural match: the bandwidth-heavy tensor parallelism traffic is entirely contained within the NVLink domain of a single node, the pipeline parallelism traffic flows between adjacent nodes requiring only moderate point-to-point bandwidth, and the data-parallel gradient reduction happens across the full fleet but with reduced volume (each DP group synchronizes only 1/32 of the total gradients). The rail-optimized design physically matches this communication structure, placing the nodes that communicate most frequently on the same switch.
|
||||
The choice of network topology is a direct reflection of the dominant communication patterns in the distributed training workload. A **fat-tree** topology offers full bisection bandwidth and non-blocking any-to-any communication, making it theoretically ideal for algorithms like AllReduce and AllToAll. However, the cost of the upper-tier spine switches makes it expensive at scale. A **torus or mesh** topology provides excellent nearest-neighbor bandwidth at a fraction of the switch cost but suffers severe congestion penalties for global traffic patterns. A **rail-optimized** topology physically wires the network so that each GPU rank connects to a dedicated leaf switch, maximizing the efficiency of structured AllReduce within parallelism groups. For our `{python} frontier_params_b`B model configured with TP-8, PP-4, and DP-32, the rail-optimized topology is the natural match: the bandwidth-heavy tensor parallelism traffic is entirely contained within the NVLink domain of a single node, the pipeline parallelism traffic flows between adjacent nodes requiring only moderate point-to-point bandwidth, and the data-parallel gradient reduction happens across the full fleet but with reduced volume (each DP group synchronizes only 1/32 of the total gradients). The rail-optimized design physically matches this communication structure, placing the nodes that communicate most frequently on the same switch.
|
||||
|
||||
### Fleet Architecture Case Studies {#sec-compute-fleet-case-studies}
|
||||
|
||||
@@ -2565,7 +2565,7 @@ The fat-tree's strength is its flexibility. Whether the training framework uses
|
||||
|
||||
The fat-tree also simplifies job scheduling significantly. Because any subset of nodes can communicate efficiently with any other subset, the scheduler can place jobs on whatever nodes are available without worrying about locality constraints. This scheduling flexibility improves cluster utilization because jobs do not need to wait for a specific set of contiguous nodes to become available. In a torus topology, by contrast, a training job that requires 64 nodes works most efficiently when those nodes form a compact sub-torus within the larger topology, which constrains the scheduler and can leave nodes idle even when the cluster has sufficient total capacity.
|
||||
|
||||
For our `{python} gpt3_params_b`B model, a SuperPOD configuration of 128 DGX H100 nodes (1,024 GPUs) connected via a two-tier InfiniBand fat-tree provides the baseline training platform. The fat-tree's full bisection bandwidth ensures that the data-parallel AllReduce of 350 GB of gradients across 128 nodes proceeds at the full InfiniBand line rate, regardless of which specific nodes are assigned to the job. This topology flexibility is particularly valuable when nodes are periodically quarantined for maintenance: the scheduler simply substitutes healthy nodes from elsewhere in the fabric without any performance penalty, maintaining training continuity.
|
||||
For our `{python} frontier_params_b`B model, a SuperPOD configuration of 128 DGX H100 nodes (1,024 GPUs) connected via a two-tier InfiniBand fat-tree provides the baseline training platform. The fat-tree's full bisection bandwidth ensures that the data-parallel AllReduce of 350 GB of gradients across 128 nodes proceeds at the full InfiniBand line rate, regardless of which specific nodes are assigned to the job. This topology flexibility is particularly valuable when nodes are periodically quarantined for maintenance: the scheduler simply substitutes healthy nodes from elsewhere in the fabric without any performance penalty, maintaining training continuity.
|
||||
|
||||
The trade-off is cost and complexity. A fat-tree requires a large number of expensive InfiniBand switches (each costing $15,000--30,000), and the switch count grows super-linearly with the number of endpoints. For a 1,024-GPU cluster, the switch fabric alone can cost $10--20 million, representing 10--15% of total system cost. Cable management also becomes formidable: a three-tier fat-tree for 1,024 GPUs requires thousands of individual cables, each of which must be precisely routed, labeled, and tested. @sec-network-fabrics examines the fat-tree topology and its alternatives in full quantitative detail.
|
||||
|
||||
@@ -2613,11 +2613,11 @@ The D1 chip's architecture is radically different from both GPUs and TPUs. Inste
|
||||
|
||||
Twenty-five D1 chips are assembled into a "training tile," with the chips arranged in a $5\times5$ grid and connected via inter-chip links that extend the 2D mesh seamlessly across chip boundaries. Multiple tiles aggregate into an ExaPOD, creating a system with hundreds of thousands of cores operating as a single large mesh. The architecture is optimized for the specific communication pattern of spatially-partitioned video processing, where data locality is high and most communication occurs between neighboring tiles.
|
||||
|
||||
Dojo's efficiency for its target workload is exceptional, but the system cannot run Transformer-based language models efficiently because the 2D mesh topology and small per-core memory are poorly matched to the all-to-all communication and large weight matrices characteristic of Transformer training. Training our `{python} gpt3_params_b`B model on Dojo would be impractical: the 350 GB weight tensor would need to be distributed across thousands of tiles, and the all-to-all communication required for tensor parallelism would traverse dozens of hops in the 2D mesh, creating latencies that would dominate the training step. This makes Dojo a high-stakes bet on the continued centrality of vision models to Tesla's autonomous driving stack, and it illustrates the extreme end of the generality-efficiency trade-off at the system level. If Tesla's workload shifts toward large language models (as the industry trend suggests), the Dojo architecture would need fundamental redesign -- a risk that custom silicon always carries.
|
||||
Dojo's efficiency for its target workload is exceptional, but the system cannot run Transformer-based language models efficiently because the 2D mesh topology and small per-core memory are poorly matched to the all-to-all communication and large weight matrices characteristic of Transformer training. Training our `{python} frontier_params_b`B model on Dojo would be impractical: the 350 GB weight tensor would need to be distributed across thousands of tiles, and the all-to-all communication required for tensor parallelism would traverse dozens of hops in the 2D mesh, creating latencies that would dominate the training step. This makes Dojo a high-stakes bet on the continued centrality of vision models to Tesla's autonomous driving stack, and it illustrates the extreme end of the generality-efficiency trade-off at the system level. If Tesla's workload shifts toward large language models (as the industry trend suggests), the Dojo architecture would need fundamental redesign -- a risk that custom silicon always carries.
|
||||
|
||||
The choice between fat-tree, torus, and rail-optimized topologies has quantitative implications for training throughput that can alter AllReduce time by 2--3$\times$ for the same cluster size. The key insight for infrastructure planning is that topology choice is not separable from workload selection: Transformer training with its regular AllReduce patterns favors different topologies than Mixture-of-Experts models with their AllToAll communication. The network fabric cost (10--15% of total system cost) is easily justified if it improves scaling efficiency by even a few percentage points, because poor scaling wastes the other 85--90% of the investment. @sec-network-fabrics provides the rigorous quantitative comparison of these topologies, including formal bandwidth analysis and the mathematical framework for computing bisection bandwidth and path diversity.
|
||||
|
||||
The cost structure of the network fabric itself warrants quantitative examination. An InfiniBand NDR switch with 64 ports of 400 Gbps costs \$15,000--30,000, with specialized active optical cables adding \$500--1,000 per link. A two-tier fat-tree for 1,024 GPUs (128 nodes) requires approximately 64 leaf switches and 32 spine switches, plus roughly 4,000 cables, bringing the total fabric cost to \$5--15 million depending on the oversubscription ratio. Ethernet-based alternatives using RoCE (RDMA over Converged Ethernet) reduce the per-port cost by 30--50%, but introduce a latency and reliability penalty. Ethernet is inherently lossy: when switch buffers overflow under the bursty, synchronized traffic patterns of distributed training, packets are dropped and must be retransmitted. For our `{python} gpt3_params_b`B model training on 1,024 GPUs, even a 1% packet retransmission rate on a RoCE fabric can degrade effective AllReduce throughput by 10--20%, because thousands of GPUs wait for the slowest participant. The cost-performance frontier analysis asks whether the \$3--5 million savings of Ethernet justifies the potential 10--20% throughput loss. For frontier model training where time-to-market is paramount and the GPU fleet represents a \$35+ million investment, the InfiniBand premium pays for itself by ensuring the network never becomes the bottleneck.
|
||||
The cost structure of the network fabric itself warrants quantitative examination. An InfiniBand NDR switch with 64 ports of 400 Gbps costs \$15,000--30,000, with specialized active optical cables adding \$500--1,000 per link. A two-tier fat-tree for 1,024 GPUs (128 nodes) requires approximately 64 leaf switches and 32 spine switches, plus roughly 4,000 cables, bringing the total fabric cost to \$5--15 million depending on the oversubscription ratio. Ethernet-based alternatives using RoCE (RDMA over Converged Ethernet) reduce the per-port cost by 30--50%, but introduce a latency and reliability penalty. Ethernet is inherently lossy: when switch buffers overflow under the bursty, synchronized traffic patterns of distributed training, packets are dropped and must be retransmitted. For our `{python} frontier_params_b`B model training on 1,024 GPUs, even a 1% packet retransmission rate on a RoCE fabric can degrade effective AllReduce throughput by 10--20%, because thousands of GPUs wait for the slowest participant. The cost-performance frontier analysis asks whether the \$3--5 million savings of Ethernet justifies the potential 10--20% throughput loss. For frontier model training where time-to-market is paramount and the GPU fleet represents a \$35+ million investment, the InfiniBand premium pays for itself by ensuring the network never becomes the bottleneck.
|
||||
|
||||
#### Site Selection and Physical Constraints
|
||||
|
||||
@@ -2673,7 +2673,7 @@ The total cost of an ML cluster decomposes into two broad categories. **Capital
|
||||
- **Electricity** (60--70% of OpEx): At \$0.07/kWh, a 1,000-GPU H100 cluster consuming 1 MW (including cooling at PUE 1.1) costs approximately \$615,000 per year in electricity alone.
|
||||
- **Staffing and maintenance** (20--30% of OpEx): System administrators, hardware technicians, replacement parts, and software license fees.
|
||||
|
||||
For our `{python} gpt3_params_b`B model, a minimum viable training cluster requires approximately 1,000 H100 GPUs spread across 125 nodes to complete a training run in 2--4 weeks. Evaluating the TCO over a three-year lifecycle reveals a stark utilization dependency. The hardware CapEx dominates at \$43.75 million (\$350,000 per node), supported by a \$5 million investment in a two-tier InfiniBand fat-tree network and a proportional \$10 million facility allocation. Operational costs add approximately \$1.5 million annually for electricity (at \$0.07/kWh with a PUE of 1.1) and specialized staffing, bringing the three-year total to roughly \$63 million. If this dedicated cluster only trains six frontier models per year, the effective cost per run is \$3.5 million. Conversely, executing the same workload on the public cloud at \$4.00 per GPU-hour with 80% utilization would cost approximately \$1.075 million per run, totaling \$19.3 million over three years -- less than a third of the on-premises investment. The economic advantage of owning hardware only materializes at *continuous utilization*: if the cluster runs 24/7 (supporting not just training but also inference, fine-tuning, and experimentation), the effective on-premises cost drops to approximately \$2.40 per GPU-hour, significantly undercutting the cloud rate. This utilization dependency is the central tension in every build-vs-buy analysis.
|
||||
For our `{python} frontier_params_b`B model, a minimum viable training cluster requires approximately 1,000 H100 GPUs spread across 125 nodes to complete a training run in 2--4 weeks. Evaluating the TCO over a three-year lifecycle reveals a stark utilization dependency. The hardware CapEx dominates at \$43.75 million (\$350,000 per node), supported by a \$5 million investment in a two-tier InfiniBand fat-tree network and a proportional \$10 million facility allocation. Operational costs add approximately \$1.5 million annually for electricity (at \$0.07/kWh with a PUE of 1.1) and specialized staffing, bringing the three-year total to roughly \$63 million. If this dedicated cluster only trains six frontier models per year, the effective cost per run is \$3.5 million. Conversely, executing the same workload on the public cloud at \$4.00 per GPU-hour with 80% utilization would cost approximately \$1.075 million per run, totaling \$19.3 million over three years -- less than a third of the on-premises investment. The economic advantage of owning hardware only materializes at *continuous utilization*: if the cluster runs 24/7 (supporting not just training but also inference, fine-tuning, and experimentation), the effective on-premises cost drops to approximately \$2.40 per GPU-hour, significantly undercutting the cloud rate. This utilization dependency is the central tension in every build-vs-buy analysis.
|
||||
|
||||
### Build vs. Buy {#sec-compute-build-vs-buy}
|
||||
|
||||
@@ -2762,7 +2762,7 @@ The build-vs-buy decision is therefore not a simple arithmetic exercise but a st
|
||||
|
||||
Many organizations find that a hybrid approach, using owned infrastructure for predictable baseline workloads and cloud for peak demand or experimental workloads, provides the best balance of cost and flexibility. The hybrid model captures the cost advantage of ownership for the steady-state workload (which justifies the capital investment) while using the cloud's elasticity for the variable portion (avoiding the risk of over-provisioning owned hardware for peak demand that occurs only occasionally).
|
||||
|
||||
For our `{python} gpt3_params_b`B model, the build-vs-buy decision depends critically on the organization's training cadence. An organization that trains one frontier model per year and serves it for the remaining 11 months faces a fundamentally different calculus than one that continuously trains, fine-tunes, and experiments. The single-model organization would achieve perhaps 15--20% utilization on owned hardware (2--4 weeks of training out of 52 weeks), making cloud rental overwhelmingly cheaper. The continuous-training organization, running back-to-back experiments, hyperparameter sweeps, and model variants, can sustain 70--80% utilization, making owned hardware the clear winner. The hybrid approach serves the middle ground: own enough hardware for the continuous baseline workload (fine-tuning, inference, experimentation) and burst to the cloud for the periodic frontier training runs that temporarily require 5--10$\times$ the baseline capacity.
|
||||
For our `{python} frontier_params_b`B model, the build-vs-buy decision depends critically on the organization's training cadence. An organization that trains one frontier model per year and serves it for the remaining 11 months faces a fundamentally different calculus than one that continuously trains, fine-tunes, and experiments. The single-model organization would achieve perhaps 15--20% utilization on owned hardware (2--4 weeks of training out of 52 weeks), making cloud rental overwhelmingly cheaper. The continuous-training organization, running back-to-back experiments, hyperparameter sweeps, and model variants, can sustain 70--80% utilization, making owned hardware the clear winner. The hybrid approach serves the middle ground: own enough hardware for the continuous baseline workload (fine-tuning, inference, experimentation) and burst to the cloud for the periodic frontier training runs that temporarily require 5--10$\times$ the baseline capacity.
|
||||
|
||||
#### Operational Complexity
|
||||
|
||||
@@ -2791,11 +2791,11 @@ The value side of the equation includes:
|
||||
|
||||
This value-oriented perspective often changes the optimal infrastructure decision. A team that evaluates infrastructure purely on TCO may choose the cheapest option, which saves money on infrastructure but produces a slower research cycle and lower-quality models. A team that evaluates on TVO may choose a more expensive infrastructure option that pays for itself through faster iteration and better models.
|
||||
|
||||
The inference dimension of TVO deserves particular emphasis because it often dominates the total economic picture. Training our `{python} gpt3_params_b`B model is a one-time cost -- even at \$5 million per training run, it is a bounded expenditure. Serving the trained model, however, is an ongoing operational cost that accumulates indefinitely. A popular LLM serving 10 million queries per day, with each query generating an average of 500 tokens, processes 5 billion tokens daily. At an inference cost of \$2.00 per million tokens on H100 hardware, the daily serving cost is \$10,000, or approximately \$3.6 million per year. The cumulative inference cost exceeds the training cost within roughly 17 months. This inversion means that infrastructure decisions optimized for training (maximizing TFLOPS per dollar) may be suboptimal for the model's total lifecycle cost. An organization that spends an additional \$2 million on training infrastructure to produce a model that is 20% more efficient at inference (through better architecture search enabled by faster experimentation) can recover that investment within months of deployment at scale.
|
||||
The inference dimension of TVO deserves particular emphasis because it often dominates the total economic picture. Training our `{python} frontier_params_b`B model is a one-time cost -- even at \$5 million per training run, it is a bounded expenditure. Serving the trained model, however, is an ongoing operational cost that accumulates indefinitely. A popular LLM serving 10 million queries per day, with each query generating an average of 500 tokens, processes 5 billion tokens daily. At an inference cost of \$2.00 per million tokens on H100 hardware, the daily serving cost is \$10,000, or approximately \$3.6 million per year. The cumulative inference cost exceeds the training cost within roughly 17 months. This inversion means that infrastructure decisions optimized for training (maximizing TFLOPS per dollar) may be suboptimal for the model's total lifecycle cost. An organization that spends an additional \$2 million on training infrastructure to produce a model that is 20% more efficient at inference (through better architecture search enabled by faster experimentation) can recover that investment within months of deployment at scale.
|
||||
|
||||
::: {.callout-notebook title="The 10,000-GPU Cluster"}
|
||||
|
||||
Consider a cluster of 1,250 DGX H100 nodes (10,000 GPUs) for training our `{python} gpt3_params_b`B model.
|
||||
Consider a cluster of 1,250 DGX H100 nodes (10,000 GPUs) for training our `{python} frontier_params_b`B model.
|
||||
|
||||
**On-Premises (3-year lifecycle)**:
|
||||
|
||||
@@ -2842,7 +2842,7 @@ This is one reason why the cloud is often more cost-effective for organizations
|
||||
|
||||
Organizations with bursty workloads that still prefer owned hardware can partially mitigate the depreciation cost by renting out their idle capacity to other organizations through GPU-as-a-service platforms. Several companies have built businesses around this model, purchasing GPU clusters, renting them to training customers, and achieving economics that work only because high utilization across multiple customers amortizes the depreciation effectively.
|
||||
|
||||
For our `{python} gpt3_params_b`B model, the depreciation calculus is stark. A 1,000-GPU H100 cluster purchased in 2024 for $35 million will have a resale value of approximately $7--10 million by 2027, when the next-next-generation accelerators (post-Blackwell) are expected to deliver 3--4$\times$ the performance per watt. If the cluster trained only two frontier models during its lifetime, each model effectively cost $12--14 million in depreciated hardware alone -- before accounting for electricity, staffing, or facility costs. If the same cluster ran continuously at 80% utilization for three years (training, fine-tuning, inference, and experimentation), the depreciated hardware cost per GPU-hour drops to approximately $1.50, well below the cloud rate. The depreciation math reinforces the central lesson of TCO analysis: utilization is the single most important variable in determining whether owned infrastructure is economically viable.
|
||||
For our `{python} frontier_params_b`B model, the depreciation calculus is stark. A 1,000-GPU H100 cluster purchased in 2024 for $35 million will have a resale value of approximately $7--10 million by 2027, when the next-next-generation accelerators (post-Blackwell) are expected to deliver 3--4$\times$ the performance per watt. If the cluster trained only two frontier models during its lifetime, each model effectively cost $12--14 million in depreciated hardware alone -- before accounting for electricity, staffing, or facility costs. If the same cluster ran continuously at 80% utilization for three years (training, fine-tuning, inference, and experimentation), the depreciated hardware cost per GPU-hour drops to approximately $1.50, well below the cloud rate. The depreciation math reinforces the central lesson of TCO analysis: utilization is the single most important variable in determining whether owned infrastructure is economically viable.
|
||||
|
||||
### Power Efficiency Trajectory {#sec-compute-power-efficiency}
|
||||
|
||||
@@ -2871,7 +2871,7 @@ The interplay between CapEx and OpEx also shapes procurement strategy. Cloud pro
|
||||
|
||||
Some organizations adopt a hybrid approach: running baseline workloads on owned infrastructure for cost efficiency and bursting to the cloud for peak demand or for early access to the latest hardware generation before committing to a large purchase. This hybrid model is increasingly common among mid-sized AI companies that have a steady-state training workload (justifying owned hardware) but periodically need 2--3$\times$ their base capacity for new model training campaigns.
|
||||
|
||||
The power efficiency trajectory has a direct implication for our `{python} gpt3_params_b`B model's training economics. Training on 1,000 V100 GPUs would require approximately 300 kW of IT power and take roughly 8 months (given the V100's lower throughput). Training on 1,000 H100 GPUs requires 700 kW but completes in approximately 2--4 weeks. The H100 cluster consumes 2.3$\times$ more power per unit time but finishes 8--16$\times$ faster, resulting in a net energy reduction of 3.5--7$\times$ for the same training run. When electricity costs $0.07/kWh, the V100 training run costs approximately $120,000 in electricity while the H100 run costs approximately $25,000. The newer hardware is simultaneously faster, cheaper to operate, and more energy-efficient -- a rare alignment that makes hardware refresh decisions straightforward for organizations with the capital to invest.
|
||||
The power efficiency trajectory has a direct implication for our `{python} frontier_params_b`B model's training economics. Training on 1,000 V100 GPUs would require approximately 300 kW of IT power and take roughly 8 months (given the V100's lower throughput). Training on 1,000 H100 GPUs requires 700 kW but completes in approximately 2--4 weeks. The H100 cluster consumes 2.3$\times$ more power per unit time but finishes 8--16$\times$ faster, resulting in a net energy reduction of 3.5--7$\times$ for the same training run. When electricity costs $0.07/kWh, the V100 training run costs approximately $120,000 in electricity while the H100 run costs approximately $25,000. The newer hardware is simultaneously faster, cheaper to operate, and more energy-efficient -- a rare alignment that makes hardware refresh decisions straightforward for organizations with the capital to invest.
|
||||
|
||||
### GPU Procurement and Supply Chain {#sec-compute-procurement}
|
||||
|
||||
@@ -2884,7 +2884,7 @@ The GPU supply chain is unusually concentrated. NVIDIA holds approximately 80--9
|
||||
|
||||
The practical consequence of this concentration is that procurement timelines for large deployments (1,000+ GPUs) typically span 6--12 months from purchase decision to first production training job, encompassing negotiation, manufacturing, shipping, installation, and burn-in testing. During periods of intense demand (such as 2023--2024), this can stretch to 12--18 months. These lead times make GPU procurement a first-order planning constraint: organizations must commit capital and secure allocations months before the hardware is needed, often before the facility to house it is complete. The 6--12 month procurement horizon, combined with the 18--30 month facility construction timeline discussed earlier, means that infrastructure teams must plan two to three years ahead, making procurement strategy inseparable from the broader capacity planning process.
|
||||
|
||||
For our `{python} gpt3_params_b`B model, the procurement challenge is acute. A minimum viable training cluster of 1,000 H100 GPUs represents approximately $35 million in hardware alone. At this scale, the organization is not purchasing off-the-shelf products but negotiating directly with NVIDIA for an allocation from a constrained production pipeline. The negotiation involves not just price but delivery schedule, warranty terms, and often a commitment to purchase future generations. Organizations that delay procurement by even one quarter may find their allocation pushed back by 6--12 months, during which time a competitor with earlier access to the same hardware can complete a training run and capture the market advantage. This first-mover dynamic has led some organizations to commit hundreds of millions of dollars to GPU procurement before their models or training recipes are fully designed, treating hardware access as a strategic asset rather than a commodity input.
|
||||
For our `{python} frontier_params_b`B model, the procurement challenge is acute. A minimum viable training cluster of 1,000 H100 GPUs represents approximately $35 million in hardware alone. At this scale, the organization is not purchasing off-the-shelf products but negotiating directly with NVIDIA for an allocation from a constrained production pipeline. The negotiation involves not just price but delivery schedule, warranty terms, and often a commitment to purchase future generations. Organizations that delay procurement by even one quarter may find their allocation pushed back by 6--12 months, during which time a competitor with earlier access to the same hardware can complete a training run and capture the market advantage. This first-mover dynamic has led some organizations to commit hundreds of millions of dollars to GPU procurement before their models or training recipes are fully designed, treating hardware access as a strategic asset rather than a commodity input.
|
||||
|
||||
### Cloud Infrastructure Options {#sec-compute-cloud-options}
|
||||
|
||||
@@ -2900,13 +2900,13 @@ Third, all major providers offer **custom accelerator** options (purpose-built t
|
||||
|
||||
The pricing models across providers share three common tiers. **On-demand** instances provide immediate access at the highest per-hour cost. **Reserved instances** (1--3 year commitments) reduce costs by 40--60% but require upfront commitment and risk hardware obsolescence. **Spot/preemptible** instances offer 60--80% discounts but can be interrupted with minimal notice, making them suitable only for fault-tolerant workloads with frequent checkpointing. The choice between tiers is a risk management decision, balancing cost against the probability and impact of interruption.
|
||||
|
||||
The economics of spot instances deserve particular attention because they can dramatically reduce training costs for organizations with the engineering sophistication to exploit them. At a 70% discount, spot H100 instances cost approximately $1.20 per GPU-hour instead of $4.00. For our `{python} gpt3_params_b`B model requiring approximately 25,000 GPU-hours of training, the savings are substantial: $30,000 on spot versus $100,000 on demand. However, the stochastic nature of preemption transforms training from a deterministic process into a fault-tolerance engineering problem. If the cloud provider reclaims 5% of the nodes mid-training, a standard training job crashes instantly. Leveraging spot economics requires an elastic training framework (such as TorchElastic) that can dynamically rebalance the computation graph when nodes are added or removed. The economic viability hinges on the **checkpoint tax**: if the system must checkpoint every 10 minutes to limit data loss from preemption, and each checkpoint takes 30 seconds, approximately 5% of the "cheap" compute is consumed by I/O overhead. There exists a break-even point where the frequency of preemption events combined with checkpoint overhead makes spot instances more expensive in wall-clock time than reserved instances, despite the lower hourly rate.
|
||||
The economics of spot instances deserve particular attention because they can dramatically reduce training costs for organizations with the engineering sophistication to exploit them. At a 70% discount, spot H100 instances cost approximately $1.20 per GPU-hour instead of $4.00. For our `{python} frontier_params_b`B model requiring approximately 25,000 GPU-hours of training, the savings are substantial: $30,000 on spot versus $100,000 on demand. However, the stochastic nature of preemption transforms training from a deterministic process into a fault-tolerance engineering problem. If the cloud provider reclaims 5% of the nodes mid-training, a standard training job crashes instantly. Leveraging spot economics requires an elastic training framework (such as TorchElastic) that can dynamically rebalance the computation graph when nodes are added or removed. The economic viability hinges on the **checkpoint tax**: if the system must checkpoint every 10 minutes to limit data loss from preemption, and each checkpoint takes 30 seconds, approximately 5% of the "cheap" compute is consumed by I/O overhead. There exists a break-even point where the frequency of preemption events combined with checkpoint overhead makes spot instances more expensive in wall-clock time than reserved instances, despite the lower hourly rate.
|
||||
|
||||
A critical consideration for cloud-based ML infrastructure is **networking between instances**. Unlike on-premises clusters where the network topology is custom-designed, cloud instances share the provider's network fabric with other tenants. Most cloud providers now offer placement groups or dedicated networking fabrics that guarantee InfiniBand or equivalent bandwidth between instances within the same group.
|
||||
|
||||
However, cross-group or cross-zone communication may traverse shared infrastructure with lower bandwidth and higher latency. Training frameworks that span multiple placement groups or availability zones must account for this heterogeneous bandwidth topology, a challenge that does not arise in dedicated on-premises clusters. The practical consequence is that cloud-based training jobs must be sized to fit within a single placement group whenever possible, as crossing group boundaries can reduce scaling efficiency by 20--40%.
|
||||
|
||||
For our `{python} gpt3_params_b`B model, the cloud path presents a specific challenge: securing 1,000+ GPUs in a single placement group for a multi-week training run. Cloud providers typically limit placement group sizes to 256--512 GPUs, meaning that a frontier training run must either negotiate a custom allocation (often requiring a multi-million dollar commitment) or accept the performance penalty of spanning multiple groups. The availability of large contiguous GPU allocations varies by region and time of day, and organizations have reported waiting weeks for a sufficiently large allocation to become available during periods of peak demand. This availability uncertainty is a hidden cost of the cloud path that does not appear in the per-GPU-hour pricing but can delay training timelines significantly.
|
||||
For our `{python} frontier_params_b`B model, the cloud path presents a specific challenge: securing 1,000+ GPUs in a single placement group for a multi-week training run. Cloud providers typically limit placement group sizes to 256--512 GPUs, meaning that a frontier training run must either negotiate a custom allocation (often requiring a multi-million dollar commitment) or accept the performance penalty of spanning multiple groups. The availability of large contiguous GPU allocations varies by region and time of day, and organizations have reported waiting weeks for a sufficiently large allocation to become available during periods of peak demand. This availability uncertainty is a hidden cost of the cloud path that does not appear in the per-GPU-hour pricing but can delay training timelines significantly.
|
||||
|
||||
::: {.callout-checkpoint title="TCO Decision Framework"}
|
||||
|
||||
@@ -2932,7 +2932,7 @@ With the workload characterized, the planning follows a **bottom-up sizing** app
|
||||
|
||||
Step 1: Accelerator Selection. Based on the Roofline analysis of the workload (compute-bound or memory-bound), select the accelerator type. For compute-bound training: optimize for TFLOPS/dollar. For memory-bound inference: optimize for bandwidth/dollar.
|
||||
|
||||
Step 2: Node Sizing. Determine the number of accelerators per node and the memory tier allocation. For our `{python} gpt3_params_b`B model: 8 GPUs per node with tensor parallelism, 2 TB host DRAM for optimizer state offloading.
|
||||
Step 2: Node Sizing. Determine the number of accelerators per node and the memory tier allocation. For our `{python} frontier_params_b`B model: 8 GPUs per node with tensor parallelism, 2 TB host DRAM for optimizer state offloading.
|
||||
|
||||
Step 3: Cluster Sizing. Divide the total compute budget by the per-node sustained throughput (accounting for MFU) to determine the number of nodes. Add 5--10% overhead for maintenance pool and spares.
|
||||
|
||||
@@ -2944,7 +2944,7 @@ Step 6: TCO Analysis. Compute the 3-year total cost of ownership, including hard
|
||||
|
||||
Step 7: Timeline and Risk Assessment. Map the procurement, construction, and deployment timelines to the project schedule. Identify the longest-lead items (typically electrical infrastructure and GPU procurement) and start those procurement processes first.
|
||||
|
||||
To illustrate this methodology concretely, consider planning the infrastructure for training our `{python} gpt3_params_b`B model. **Step 1**: The training workload at large batch sizes is compute-bound, so we optimize for TFLOPS per dollar and select the H100 (highest available TFLOPS at reasonable cost). **Step 2**: The 2.2 TB training state requires 8-way tensor parallelism within each node, yielding a DGX H100 configuration. **Step 3**: The total compute budget is $6 \times 175 \times 10^9 \times 300 \times 10^9 = 3.15 \times 10^{23}$ FLOPs. At 45% MFU on H100s, each GPU delivers $1,979 \times 10^{12} \times 0.45 \approx 891$ TFLOPS sustained. With 1,024 GPUs, the cluster delivers $891 \times 1,024 \approx 912$ PFLOPS sustained, completing training in $3.15 \times 10^{23} / 9.12 \times 10^{17} \approx 345,000$ seconds or roughly 4 days of idealized compute time (2--4 weeks with operational overhead). **Step 4**: The TP-8, PP-4, DP-32 configuration generates structured AllReduce traffic suited to a rail-optimized InfiniBand fabric. **Step 5**: 128 nodes at `{python} rack_power_str` kW per 4-node rack requires 32 racks drawing approximately 1.1 MW total at PUE 1.08, necessitating liquid cooling. **Step 6**: The 3-year TCO is approximately $63M on-premises vs. $19M cloud for this specific workload, with the break-even depending on utilization beyond the initial training run. **Step 7**: GPU procurement (6--12 months) and facility preparation (if needed) must begin immediately, with phased deployment targeting initial capacity within 3 months.
|
||||
To illustrate this methodology concretely, consider planning the infrastructure for training our `{python} frontier_params_b`B model. **Step 1**: The training workload at large batch sizes is compute-bound, so we optimize for TFLOPS per dollar and select the H100 (highest available TFLOPS at reasonable cost). **Step 2**: The 2.2 TB training state requires 8-way tensor parallelism within each node, yielding a DGX H100 configuration. **Step 3**: The total compute budget is $6 \times 175 \times 10^9 \times 300 \times 10^9 = 3.15 \times 10^{23}$ FLOPs. At 45% MFU on H100s, each GPU delivers $1,979 \times 10^{12} \times 0.45 \approx 891$ TFLOPS sustained. With 1,024 GPUs, the cluster delivers $891 \times 1,024 \approx 912$ PFLOPS sustained, completing training in $3.15 \times 10^{23} / 9.12 \times 10^{17} \approx 345,000$ seconds or roughly 4 days of idealized compute time (2--4 weeks with operational overhead). **Step 4**: The TP-8, PP-4, DP-32 configuration generates structured AllReduce traffic suited to a rail-optimized InfiniBand fabric. **Step 5**: 128 nodes at `{python} rack_power_str` kW per 4-node rack requires 32 racks drawing approximately 1.1 MW total at PUE 1.08, necessitating liquid cooling. **Step 6**: The 3-year TCO is approximately $63M on-premises vs. $19M cloud for this specific workload, with the break-even depending on utilization beyond the initial training run. **Step 7**: GPU procurement (6--12 months) and facility preparation (if needed) must begin immediately, with phased deployment targeting initial capacity within 3 months.
|
||||
|
||||
::: {.callout-checkpoint title="Infrastructure Planning Exercise"}
|
||||
|
||||
@@ -2968,7 +2968,7 @@ The economics of infrastructure complete our tour of the physical stack. From th
|
||||
|
||||
The infrastructure stack described in the preceding sections represents the state of the art as of 2024. Several technologies under active development, however, could fundamentally alter the constraints that shape fleet design. Understanding these emerging directions is important for infrastructure planners because the decisions made today (facility design, power provisioning, cooling architecture) must accommodate hardware that does not yet exist. A datacenter built in 2025 will host three or four generations of accelerators over its 15-year structural lifetime, and each generation may demand capabilities that the facility must already support.
|
||||
|
||||
Each of the technologies below targets one of the three walls that have governed this chapter. CXL and disaggregated architectures attack the Memory Wall by decoupling capacity from individual accelerators. Optical interconnects attack the Communication Wall by narrowing the bandwidth gap between intra-node and inter-node links. Wafer-scale integration attacks all three walls simultaneously by eliminating the off-chip boundary entirely. For our `{python} gpt3_params_b`B model, these technologies would progressively relax the constraints that currently force complex multi-node parallelism: if CXL memory pooling provides terabyte-scale capacity accessible from any accelerator, and optical interconnects deliver NVLink-class bandwidth between nodes, the distinction between "within a node" and "across nodes" that drives hierarchy-aware parallelism would begin to dissolve.
|
||||
Each of the technologies below targets one of the three walls that have governed this chapter. CXL and disaggregated architectures attack the Memory Wall by decoupling capacity from individual accelerators. Optical interconnects attack the Communication Wall by narrowing the bandwidth gap between intra-node and inter-node links. Wafer-scale integration attacks all three walls simultaneously by eliminating the off-chip boundary entirely. For our `{python} frontier_params_b`B model, these technologies would progressively relax the constraints that currently force complex multi-node parallelism: if CXL memory pooling provides terabyte-scale capacity accessible from any accelerator, and optical interconnects deliver NVLink-class bandwidth between nodes, the distinction between "within a node" and "across nodes" that drives hierarchy-aware parallelism would begin to dissolve.
|
||||
|
||||
### Compute Express Link (CXL) {#sec-compute-cxl}
|
||||
|
||||
@@ -2984,7 +2984,7 @@ CXL changes this by creating a **unified memory fabric** where the GPU can direc
|
||||
|
||||
The more consequential application of CXL is **memory pooling**. CXL 3.0 enables a pool of memory devices (CXL memory expanders) connected to a CXL switch, with the pooled memory accessible by any processor or accelerator connected to the same switch. This decouples memory capacity from the number of compute devices: instead of each node having a fixed amount of DDR, a CXL memory pool can be dynamically allocated to whichever nodes need it most.
|
||||
|
||||
For ML training, memory pooling addresses the optimizer state problem directly. The 1,400 GB of optimizer state for our `{python} gpt3_params_b`B model could reside in a shared CXL memory pool rather than being replicated or sharded across individual nodes. Nodes that are executing the compute-intensive forward and backward passes would access only their weight and activation shards from HBM, while the optimizer state would be fetched from the CXL pool only during the parameter update step. This architecture would reduce the per-node memory requirement and potentially allow training of larger models on fewer nodes.
|
||||
For ML training, memory pooling addresses the optimizer state problem directly. The 1,400 GB of optimizer state for our `{python} frontier_params_b`B model could reside in a shared CXL memory pool rather than being replicated or sharded across individual nodes. Nodes that are executing the compute-intensive forward and backward passes would access only their weight and activation shards from HBM, while the optimizer state would be fetched from the CXL pool only during the parameter update step. This architecture would reduce the per-node memory requirement and potentially allow training of larger models on fewer nodes.
|
||||
|
||||
The challenge is bandwidth. CXL 3.0 over a PCIe Gen5 x16 link provides approximately 64 GB/s of read bandwidth, which is roughly 50$\times$ slower than HBM3 (3.35 TB/s). Data that must be accessed at HBM speeds (weights and activations during the forward and backward passes) cannot reside in CXL memory without creating severe bottlenecks. CXL memory is therefore a complement to HBM, not a replacement: it extends the capacity of the memory hierarchy without competing with HBM's bandwidth tier.
|
||||
|
||||
@@ -3030,7 +3030,7 @@ At the opposite extreme from disaggregation, **wafer-scale integration** maximiz
|
||||
|
||||
The approach directly attacks the Memory Wall. By replacing HBM (which is off-chip, connected via an interposer) with distributed on-chip SRAM (which is on-die, connected via a few-millimeter mesh), the WSE reduces data access latency from nanoseconds (HBM) to sub-nanosecond (SRAM) and increases aggregate bandwidth to levels that no discrete accelerator can match. For models that fit within the 44 GB of on-chip memory, the WSE can achieve near-100% utilization of its arithmetic units, because the memory system is never the bottleneck.
|
||||
|
||||
The limitation is that 44 GB of SRAM is insufficient for frontier models. Our `{python} gpt3_params_b`B model requires 350 GB for weights alone, far exceeding the WSE's capacity. Cerebras addresses this through a model-parallel scheme where the WSE processes layers sequentially, with weights streamed from external memory (MemoryX units) to the chip. This weight streaming approach works well for inference (where each layer is used once per token) but introduces pipelining complexity for training (where each layer is used in both forward and backward passes with different activations).
|
||||
The limitation is that 44 GB of SRAM is insufficient for frontier models. Our `{python} frontier_params_b`B model requires 350 GB for weights alone, far exceeding the WSE's capacity. Cerebras addresses this through a model-parallel scheme where the WSE processes layers sequentially, with weights streamed from external memory (MemoryX units) to the chip. This weight streaming approach works well for inference (where each layer is used once per token) but introduces pipelining complexity for training (where each layer is used in both forward and backward passes with different activations).
|
||||
|
||||
Wafer-scale integration also faces manufacturing challenges. Because the chip spans an entire wafer, individual defective cores must be disabled and routed around using redundant interconnect paths. The yield model is fundamentally different from conventional chips: rather than discarding entire dies with defects, the WSE includes spare cores and spare interconnect links that can be activated to replace defective ones. This redundancy-based yield strategy works because neural network workloads are tolerant of slight variations in available compute (losing 1% of cores has negligible impact on training throughput).
|
||||
|
||||
@@ -3048,7 +3048,7 @@ Fallacy: *More GPUs always means faster training.*
|
||||
|
||||
Engineers frequently assume that training time scales linearly with GPU count: doubling the GPUs should halve the training time. In practice, communication overhead grows with cluster size and eventually dominates.
|
||||
|
||||
Amdahl's Law establishes the theoretical limit: the sequential fraction of the computation (gradient synchronization, pipeline bubble time, data loading stalls) bounds the maximum speedup regardless of parallelism. For a `{python} gpt3_params_b`B-parameter model with 350 GB of gradients, the AllReduce at each training step requires every GPU to exchange data with every other GPU.
|
||||
Amdahl's Law establishes the theoretical limit: the sequential fraction of the computation (gradient synchronization, pipeline bubble time, data loading stalls) bounds the maximum speedup regardless of parallelism. For a `{python} frontier_params_b`B-parameter model with 350 GB of gradients, the AllReduce at each training step requires every GPU to exchange data with every other GPU.
|
||||
|
||||
On a cluster of 1,024 GPUs, this synchronization can consume 30--50% of the total step time if the network is not carefully engineered. Scaling from 1,024 to 2,048 GPUs doubles the hardware cost but may reduce training time by only 30--40%, yielding rapidly diminishing returns. The scaling efficiency curve is concave: the first 100 GPUs provide nearly linear speedup, the next 900 provide diminishing returns, and beyond 2,000--4,000 GPUs the marginal benefit per GPU approaches zero for most model sizes.
|
||||
|
||||
@@ -3098,17 +3098,17 @@ Teams that focus all their pre-deployment optimization on GPU kernel performance
|
||||
|
||||
Fallacy: *Homogeneous clusters are always better than heterogeneous ones.*
|
||||
|
||||
The intuition that uniform hardware simplifies scheduling and reduces stragglers often leads organizations to retire capable older generations prematurely. For our `{python} gpt3_params_b`B model, the cost-optimal strategy frequently involves a mixed-generation fleet. While training requires the raw FLOPS and interconnect bandwidth of H100s to minimize synchronization overhead, inference serving is memory-bandwidth-bound rather than compute-bound. An A100 offers 2.0 TB/s of bandwidth at a significantly lower capital expenditure than the H100's 3.35 TB/s. By dedicating H100 nodes to training and A100 nodes to inference, an organization can reduce TCO by 15--25% compared to an all-H100 fleet. The fallacy lies in conflating *job-level* homogeneity -- which is critical to prevent stragglers within a single distributed training run -- with *cluster-level* homogeneity. A sophisticated scheduler can effectively manage a heterogeneous fleet, routing bandwidth-intensive inference jobs to older hardware where the bandwidth-per-dollar ratio is competitive, while reserving peak compute nodes for training throughput.
|
||||
The intuition that uniform hardware simplifies scheduling and reduces stragglers often leads organizations to retire capable older generations prematurely. For our `{python} frontier_params_b`B model, the cost-optimal strategy frequently involves a mixed-generation fleet. While training requires the raw FLOPS and interconnect bandwidth of H100s to minimize synchronization overhead, inference serving is memory-bandwidth-bound rather than compute-bound. An A100 offers 2.0 TB/s of bandwidth at a significantly lower capital expenditure than the H100's 3.35 TB/s. By dedicating H100 nodes to training and A100 nodes to inference, an organization can reduce TCO by 15--25% compared to an all-H100 fleet. The fallacy lies in conflating *job-level* homogeneity -- which is critical to prevent stragglers within a single distributed training run -- with *cluster-level* homogeneity. A sophisticated scheduler can effectively manage a heterogeneous fleet, routing bandwidth-intensive inference jobs to older hardware where the bandwidth-per-dollar ratio is competitive, while reserving peak compute nodes for training throughput.
|
||||
|
||||
Pitfall: *Underestimating the time and cost of the "last mile" -- installation, burn-in, and commissioning.*
|
||||
|
||||
Engineering teams often allocate 90% of their planning effort to hardware selection and facility design, assuming that racking and stacking is a deterministic commodity task. In reality, the last mile -- physically installing servers, routing cables, filling coolant loops, and running burn-in tests -- frequently delays production availability by 2--4 months. A cluster for our `{python} gpt3_params_b`B model involves thousands of cables; a single loose InfiniBand connection or a pinched fiber optic cable can degrade effective bisection bandwidth by 50%, stalling distributed training. Insidious issues like firmware incompatibilities between GPU driver versions and InfiniBand switch firmware, or NUMA misconfigurations in the BIOS, often manifest only under sustained load. Experienced infrastructure teams allocate 15--20% of the total project timeline specifically for commissioning and burn-in, running synthetic stress tests (NCCL-tests, HPL benchmarks) for weeks to weed out "infant mortality" failures before a single production job is scheduled.
|
||||
Engineering teams often allocate 90% of their planning effort to hardware selection and facility design, assuming that racking and stacking is a deterministic commodity task. In reality, the last mile -- physically installing servers, routing cables, filling coolant loops, and running burn-in tests -- frequently delays production availability by 2--4 months. A cluster for our `{python} frontier_params_b`B model involves thousands of cables; a single loose InfiniBand connection or a pinched fiber optic cable can degrade effective bisection bandwidth by 50%, stalling distributed training. Insidious issues like firmware incompatibilities between GPU driver versions and InfiniBand switch firmware, or NUMA misconfigurations in the BIOS, often manifest only under sustained load. Experienced infrastructure teams allocate 15--20% of the total project timeline specifically for commissioning and burn-in, running synthetic stress tests (NCCL-tests, HPL benchmarks) for weeks to weed out "infant mortality" failures before a single production job is scheduled.
|
||||
|
||||
## Summary {#sec-compute-summary}
|
||||
|
||||
\index{Compute Infrastructure!summary}
|
||||
|
||||
This chapter has traced the physical infrastructure of machine learning from the transistor to the datacenter, using the running example of training a `{python} gpt3_params_b`B-parameter model to ground each concept in quantitative reality. The infrastructure stack is not a collection of independent components but an integrated system where decisions at each level constrain and enable choices at every other level. The accelerator's TDP determines the rack's cooling requirements. The rack's power density determines the pod's physical layout. The pod's network topology determines the achievable scaling efficiency. The scaling efficiency, in turn, determines the economics that make the entire enterprise viable or futile.
|
||||
This chapter has traced the physical infrastructure of machine learning from the transistor to the datacenter, using the running example of training a `{python} frontier_params_b`B-parameter model to ground each concept in quantitative reality. The infrastructure stack is not a collection of independent components but an integrated system where decisions at each level constrain and enable choices at every other level. The accelerator's TDP determines the rack's cooling requirements. The rack's power density determines the pod's physical layout. The pod's network topology determines the achievable scaling efficiency. The scaling efficiency, in turn, determines the economics that make the entire enterprise viable or futile.
|
||||
|
||||
The ML fleet is built through a series of progressive responses to physical constraints. Each constraint at one level of the hierarchy creates the engineering motivation for the next level:
|
||||
|
||||
@@ -3140,19 +3140,19 @@ The physical infrastructure is not merely a container for software; it is a cons
|
||||
# │ Show: ~175B parameters; ~1,979 TFLOPS; ~3.35 TB/s; ~27 kW rack.
|
||||
# │ How: pulling constants from mlsys.constants.
|
||||
# │
|
||||
# │ Imports: mlsys.constants (GPT3_PARAMS, H100_FLOPS_FP16_TENSOR, H100_MEM_BW,
|
||||
# │ Imports: mlsys.constants (Models.GPT3.parameters, system.peak_flops, system.memory_bw,
|
||||
# │ H100_TDP, GPUS_PER_HOST, param, BILLION, TFLOPs, second, TB, kilowatt)
|
||||
# │ Exports: frontier_params_b (renamed from gpt3_params_b -- verify export assignment matches), h100_tflops, h100_bw, rack_power_str
|
||||
# └─────────────────────────────────────────────────────────────────────────────
|
||||
from mlsys.constants import GPT3_PARAMS, H100_FLOPS_FP16_TENSOR, H100_MEM_BW, H100_TDP, GPUS_PER_HOST, param, BILLION, TFLOPs, second, TB, kilowatt
|
||||
from mlsys.constants import Models, system, H100_TDP, GPUS_PER_HOST, param, BILLION, TFLOPs, second, TB, kilowatt
|
||||
|
||||
class SummaryScaleScenario:
|
||||
"""Namespace for summary scale reference."""
|
||||
|
||||
# ┌── 1. LOAD (Constants) ──────────────────────────────────────────────
|
||||
params = GPT3_PARAMS
|
||||
flops = H100_FLOPS_FP16_TENSOR
|
||||
bw = H100_MEM_BW
|
||||
params = Models.GPT3.parameters
|
||||
flops = system.peak_flops
|
||||
bw = system.memory_bw
|
||||
tdp = H100_TDP
|
||||
gpus_per_node = GPUS_PER_HOST
|
||||
|
||||
@@ -3173,9 +3173,9 @@ h100_bw = SummaryScaleScenario.h100_bw
|
||||
rack_power_str = SummaryScaleScenario.rack_power_str
|
||||
```
|
||||
|
||||
Our `{python} gpt3_params_b`B-parameter model has served as the persistent architectural forcing function connecting every layer of this hierarchy. At the accelerator level, processing a single token requires 350 billion floating-point operations, demanding specialized Tensor Cores that deliver nearly `{python} h100_tflops` TFLOPS to achieve interactive latency. At the memory level, the 350 GB weight tensor saturates HBM interfaces; streaming this volume at `{python} h100_bw` TB/s creates a hard latency floor of over 100 ms per token, regardless of arithmetic throughput. Moving to the node level, the full training state -- comprising weights, gradients, and optimizer states -- expands to over 2 TB, shattering the 80 GB limit of individual chips and necessitating 8-way tensor parallelism across NVLink. At the rack level, the thermal density of 32 such accelerators drawing 700 W each necessitates `{python} rack_power_str` kW of power delivery and liquid cooling infrastructure. At the pod level, training within a viable two-week window requires synchronizing over 1,000 GPUs across a non-blocking InfiniBand fabric, where the statistical certainty of hardware failure forces a checkpointing strategy that trades compute cycles for reliability. At the economics level, the 3-year TCO of the required cluster ranges from $63M (on-premises at high utilization) to hundreds of millions (cloud), with the break-even utilization determining which path is viable. Every constraint in this chapter -- from the width of the HBM bus to the topology of the datacenter network -- traces back to the single arithmetic fact that this model does not fit on a single chip.
|
||||
Our `{python} frontier_params_b`B-parameter model has served as the persistent architectural forcing function connecting every layer of this hierarchy. At the accelerator level, processing a single token requires 350 billion floating-point operations, demanding specialized Tensor Cores that deliver nearly `{python} h100_tflops` TFLOPS to achieve interactive latency. At the memory level, the 350 GB weight tensor saturates HBM interfaces; streaming this volume at `{python} h100_bw` TB/s creates a hard latency floor of over 100 ms per token, regardless of arithmetic throughput. Moving to the node level, the full training state -- comprising weights, gradients, and optimizer states -- expands to over 2 TB, shattering the 80 GB limit of individual chips and necessitating 8-way tensor parallelism across NVLink. At the rack level, the thermal density of 32 such accelerators drawing 700 W each necessitates `{python} rack_power_str` kW of power delivery and liquid cooling infrastructure. At the pod level, training within a viable two-week window requires synchronizing over 1,000 GPUs across a non-blocking InfiniBand fabric, where the statistical certainty of hardware failure forces a checkpointing strategy that trades compute cycles for reliability. At the economics level, the 3-year TCO of the required cluster ranges from $63M (on-premises at high utilization) to hundreds of millions (cloud), with the break-even utilization determining which path is viable. Every constraint in this chapter -- from the width of the HBM bus to the topology of the datacenter network -- traces back to the single arithmetic fact that this model does not fit on a single chip.
|
||||
|
||||
For our `{python} gpt3_params_b`B model, the chapter has shown that infrastructure selection is not a single decision but a cascade of interdependent choices. The accelerator determines the per-chip throughput and memory bandwidth. The node design determines how many accelerators can cooperate efficiently on tensor-parallel operations. The rack design determines how much power and cooling is available per unit of floor space. The pod design determines how many nodes can synchronize gradients without communication overhead dominating the training loop.
|
||||
For our `{python} frontier_params_b`B model, the chapter has shown that infrastructure selection is not a single decision but a cascade of interdependent choices. The accelerator determines the per-chip throughput and memory bandwidth. The node design determines how many accelerators can cooperate efficiently on tensor-parallel operations. The rack design determines how much power and cooling is available per unit of floor space. The pod design determines how many nodes can synchronize gradients without communication overhead dominating the training loop.
|
||||
|
||||
The TCO analysis determines whether the entire enterprise makes financial sense. Each choice constrains the next, creating a system where no component can be optimized in isolation. Selecting the fastest accelerator is counterproductive if the cooling infrastructure cannot remove its heat. Building the densest rack is futile if the power grid cannot supply its demand. Deploying the widest network is wasteful if the workload's communication pattern does not use the bandwidth. Infrastructure engineering is systems engineering in its purest form: every component exists in a web of physical, economic, and operational constraints that must be satisfied simultaneously.
|
||||
|
||||
@@ -3193,7 +3193,7 @@ The TCO analysis determines whether the entire enterprise makes financial sense.
|
||||
|
||||
:::
|
||||
|
||||
The practitioner's central takeaway from this infrastructure stack is that no component exists in isolation. The `{python} gpt3_params_b`B-parameter running example was chosen precisely because its scale forces engagement with every level of the hierarchy: a model that fit comfortably on a single chip would never expose the cooling constraints, network topology trade-offs, or failure-rate arithmetic that dominate real production systems. By tracing a single model from the arithmetic unit through HBM, NVLink, the rack power envelope, and the datacenter fabric, the chapter demonstrates a mode of reasoning that transfers to any model at any scale. The specific numbers will change as hardware generations advance, but the method of analysis remains: identify the binding constraint at each level, quantify its impact, and propagate its consequences upward through the stack.
|
||||
The practitioner's central takeaway from this infrastructure stack is that no component exists in isolation. The `{python} frontier_params_b`B-parameter running example was chosen precisely because its scale forces engagement with every level of the hierarchy: a model that fit comfortably on a single chip would never expose the cooling constraints, network topology trade-offs, or failure-rate arithmetic that dominate real production systems. By tracing a single model from the arithmetic unit through HBM, NVLink, the rack power envelope, and the datacenter fabric, the chapter demonstrates a mode of reasoning that transfers to any model at any scale. The specific numbers will change as hardware generations advance, but the method of analysis remains: identify the binding constraint at each level, quantify its impact, and propagate its consequences upward through the stack.
|
||||
|
||||
This systems-level perspective is what separates infrastructure engineering from hardware shopping. An engineer who understands only accelerator FLOPS will over-provision compute and under-provision cooling. An engineer who understands only network bandwidth will design a topology that exceeds the facility's power budget. The discipline this chapter cultivates is the ability to hold the entire constraint graph in mind simultaneously, reasoning about how a change in numerical precision at the chip level alters the power draw at the rack level, the cooling load at the facility level, and the TCO at the business level. That integrative judgment, grounded in quantitative analysis rather than intuition, is the foundation on which every subsequent chapter in this volume builds.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user