Files
cs249r_book/labs/vol2/lab_03_network_fabrics.py
Vijay Janapa Reddi 6f5732558f feat: add complete first-draft labs for both volumes (33 Marimo labs)
Add all Vol1 (labs 01-16) and Vol2 (labs 01-17) interactive Marimo labs
as the first full first-pass implementation of the ML Systems curriculum labs.

Each lab follows the PROTOCOL 2-Act structure (35-40 min):
- Act I: Calibration with prediction lock → instruments → overlay
- Act II: Design challenge with failure states and reflection

Key pedagogical instruments introduced progressively:
- Vol1: D·A·M Triad, Iron Law, Memory Ledger, Roofline, Amdahl's Law,
  Little's Law, P99 Histogram, Compression Frontier, Chouldechova theorem
- Vol2: NVLink vs PCIe cliff, Bisection BW, Young-Daly T*, Parallelism Paradox,
  AllReduce ring vs tree, KV-cache model, Jevons Paradox, DP ε-δ tradeoff,
  SLO composition, Adversarial Pareto, two-volume synthesis capstone

All 35 staged files pass AST syntax verification (36/36 including lab_00).

Also includes:
- labs/LABS_SPEC.md: authoritative sub-agent brief for all lab conventions
- labs/core/style.py: expanded unified design system with semantic color tokens
2026-03-01 19:59:04 -05:00

1561 lines
70 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import marimo

# Marimo version that generated this notebook (compatibility marker).
__generated_with = "0.19.6"
# App scaffold: each `@app.cell` function below registers one cell in the
# notebook's dataflow graph; "full" width uses the whole browser viewport.
app = marimo.App(width="full")
# ─────────────────────────────────────────────────────────────────────────────
# LAB 03: THE BISECTION BANDWIDTH WALL
#
# Chapter: network_fabrics.qmd (@sec-network-fabrics)
# Volume II · Lab 03
#
# Core Invariant: Fat-tree topology provides full bisection bandwidth.
# As clusters scale, bisection bandwidth per GPU decreases unless the fabric
# is properly overprovisioned. The bisection bandwidth wall is why not all
# topologies scale equally.
#
# Two Acts:
# Act I — The Bisection Bandwidth Blindspot (12–15 min)
# A 128-GPU cluster uses 2:1 oversubscription. AllReduce is 40% of expected.
# Why? Bisection bandwidth halved by oversubscription explains the drop.
#
# Act II — The 1024-GPU Fabric Design (20–25 min)
# $50M budget, 3 fabric options: IB fat-tree, Eth fat-tree, 3D-torus.
# AllReduce at scale is dominated by bisection bandwidth, not per-link speed.
# Failure state: bisection BW insufficient → AllReduce > 1-second target.
#
# Contexts: 8-GPU cluster (single-node reference) vs 1024-GPU cluster (scale)
#
# Design Ledger: chapter="v2_03", context, fabric_type, cluster_size,
# oversubscription, bisection_bw_gbps, act1_prediction,
# act1_correct, act2_result, act2_decision, constraint_hit
#
# Hardware constants (source in comments on each constant):
# IB_HDR200_PORT_GBPS = 200 # InfiniBand HDR200 single port, Gb/s
# IB_NDR400_PORT_GBPS = 400 # InfiniBand NDR400 single port, Gb/s
# ETH_100G_PORT_GBPS = 100 # 100GbE port line rate, Gb/s
# ETH_400G_PORT_GBPS = 400 # 400GbE port line rate, Gb/s
# IB_HDR200_EFF_GBS = 22.5 # InfiniBand HDR200 effective unidirectional GB/s
# IB_NDR400_EFF_GBS = 45.0 # InfiniBand NDR400 effective unidirectional GB/s
# ETH_100G_EFF_GBS = 11.0 # 100GbE effective GB/s after TCP/IP overhead
# ETH_400G_EFF_GBS = 44.0 # 400GbE effective GB/s after overhead
# ─────────────────────────────────────────────────────────────────────────────
# ─── CELL 0: SETUP (hide_code=False — leave visible for instructor inspection) ─
@app.cell
def _():
    """Import dependencies, define hardware constants, and create the ledger.

    Every name in the return tuple is consumed by downstream cells through
    marimo's dataflow graph.
    """
    import marimo as mo
    import sys
    import math
    from pathlib import Path
    import plotly.graph_objects as go
    import numpy as np

    # Make the repo root importable so `labs.core.*` resolves no matter which
    # working directory the notebook is launched from.
    _root = Path(__file__).resolve().parents[2]
    if str(_root) not in sys.path:
        sys.path.insert(0, str(_root))
    from labs.core.state import DesignLedger
    from labs.core.style import COLORS, LAB_CSS, apply_plotly_theme

    # ── Hardware constants (source: vendor datasheets + @sec-network-fabrics) ──
    # InfiniBand port rates — NVIDIA/Mellanox datasheet
    IB_HDR200_PORT_GBPS = 200  # Gb/s per port, InfiniBand HDR200 (HDR)
    IB_NDR400_PORT_GBPS = 400  # Gb/s per port, InfiniBand NDR (NDR400)
    # Ethernet port rates — IEEE 802.3 standards
    ETH_100G_PORT_GBPS = 100  # Gb/s per port, 100GbE
    ETH_400G_PORT_GBPS = 400  # Gb/s per port, 400GbE
    # Effective unidirectional bandwidth (Gb/s → GB/s with protocol overhead)
    # InfiniBand: ~90% efficiency (minimal overhead), source: MLPerf network BW
    IB_HDR200_EFF_GBS = 22.5  # GB/s unidirectional (200 Gbps × 0.9 / 8)
    IB_NDR400_EFF_GBS = 45.0  # GB/s unidirectional (400 Gbps × 0.9 / 8)
    # Ethernet: ~88% efficiency after TCP/UDP overhead
    ETH_100G_EFF_GBS = 11.0  # GB/s unidirectional (100 Gbps × 0.88 / 8)
    ETH_400G_EFF_GBS = 44.0  # GB/s unidirectional (400 Gbps × 0.88 / 8)
    # Cost estimates for 1024-GPU fabric (Act II), source: industry white papers.
    # These are order-of-magnitude estimates for pedagogical framing only.
    IB_NDR_FABRIC_COST_M = 20.0  # $M for 1024-GPU IB NDR fat-tree, non-blocking
    ETH_400G_FABRIC_COST_M = 8.0  # $M for 1024-GPU 400GbE fat-tree, 2:1 oversub
    TORUS_3D_FABRIC_COST_M = 5.0  # $M for 1024-GPU 3D-torus (fixed links)
    # AllReduce ring-allreduce formula constant:
    #   Ring AllReduce time = 2 * (N-1)/N * message_size / bisection_bw
    #   For large N this approaches 2 * message_size / bisection_bw.
    # Source: @sec-network-fabrics-allreduce-algorithms (Rabenseifner's algorithm)
    ledger = DesignLedger()  # records the student's design decisions this session
    return (
        mo, ledger, COLORS, LAB_CSS, apply_plotly_theme,
        go, np, math,
        IB_HDR200_PORT_GBPS, IB_NDR400_PORT_GBPS,
        ETH_100G_PORT_GBPS, ETH_400G_PORT_GBPS,
        IB_HDR200_EFF_GBS, IB_NDR400_EFF_GBS,
        ETH_100G_EFF_GBS, ETH_400G_EFF_GBS,
        IB_NDR_FABRIC_COST_M, ETH_400G_FABRIC_COST_M, TORUS_3D_FABRIC_COST_M,
    )
# ─── CELL 1: HEADER ───────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, LAB_CSS, COLORS):
    """Render the lab's hero banner: title, governing formula, and badges.

    NOTE(review): the banner text reads "2 Acts · 3540 min" — this looks like
    a mangled en-dash range ("35–40 min"); it lives in a runtime string, so
    confirm against the original file before correcting.
    """
    mo.vstack([
        LAB_CSS,
        mo.Html(f"""
<div style="background: linear-gradient(135deg, #0f172a 0%, #1e293b 60%, #0c1a2e 100%);
padding: 36px 44px; border-radius: 16px; color: white;
box-shadow: 0 8px 32px rgba(0,0,0,0.35);">
<div style="font-size: 0.72rem; font-weight: 700; letter-spacing: 0.18em;
color: #475569; text-transform: uppercase; margin-bottom: 10px;">
Machine Learning Systems · Volume II · Lab 03
</div>
<h1 style="margin: 0 0 10px 0; font-size: 2.4rem; font-weight: 900;
color: #f8fafc; line-height: 1.1; letter-spacing: -0.02em;">
The Bisection Bandwidth Wall
</h1>
<p style="margin: 0 0 6px 0; font-size: 1.05rem; font-weight: 600;
color: #94a3b8; letter-spacing: 0.04em; font-family: 'SF Mono', monospace;">
BW_bisection = N × link_BW / (2 × oversubscription_ratio)
</p>
<p style="margin: 0 0 22px 0; font-size: 1.0rem; color: #64748b;
max-width: 700px; line-height: 1.65;">
Fat-tree topology promises full bisection bandwidth — every GPU reaches
every other GPU at full link speed. That promise holds only when the fabric
is non-blocking. Oversubscription quietly halves it. At 1024 GPUs, the choice
of topology determines whether AllReduce finishes in 200 ms or 4 seconds.
</p>
<div style="display: flex; gap: 12px; flex-wrap: wrap; margin-bottom: 20px;">
<span style="background: rgba(99,102,241,0.18); color: #a5b4fc;
padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
font-weight: 600; border: 1px solid rgba(99,102,241,0.3);">
2 Acts · 3540 min
</span>
<span style="background: rgba(203,32,45,0.15); color: #fca5a5;
padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
font-weight: 600; border: 1px solid rgba(203,32,45,0.25);">
Chapter: @sec-network-fabrics
</span>
<span style="background: rgba(0,143,69,0.15); color: #6ee7b7;
padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
font-weight: 600; border: 1px solid rgba(0,143,69,0.25);">
Instrument: Bisection BW Calculator
</span>
</div>
<div style="display: flex; gap: 10px; flex-wrap: wrap;">
<span class="badge badge-info">Bisection Bandwidth</span>
<span class="badge badge-warn">Oversubscription</span>
<span class="badge badge-fail">AllReduce Bottleneck</span>
<span class="badge badge-ok">Fat-Tree Non-Blocking</span>
</div>
</div>
"""),
    ])
    return
# ─── CELL 2: RECOMMENDED READING ──────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    """Pre-lab reading list pointing at the network-fabrics chapter sections."""
    mo.callout(mo.md("""
**Recommended Reading** — Complete the following before this lab:
- **@sec-network-fabrics-bisection-bandwidth** (Bisection Bandwidth) — The minimum-cut
definition, why it sets the all-to-all communication ceiling, and the fat-tree formula.
- **@sec-network-fabrics-fat-tree-topology** (Fat-Tree Structure) — k-ary fat-tree
construction, oversubscription ratios, and the non-blocking guarantee.
- **@sec-network-fabrics-allreduce** (AllReduce Algorithms) — Ring-AllReduce bandwidth
analysis and why bisection bandwidth is the bottleneck for large clusters.
- **@sec-network-fabrics-topology-comparison** (Topology Trade-offs) — Fat-tree vs
3D-torus bisection bandwidth scaling: O(N) vs O(N^(2/3)).
"""), kind="info")
    return
# ─── CELL 3: CONTEXT TOGGLE ───────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    """Render the cluster-scale radio toggle shared by both acts.

    Maps a human-readable cluster description to the internal scale key
    ("small" / "large") that downstream computations read.
    """
    _scale_options = {
        "8-GPU Cluster (single-node reference scale)": "small",
        "1024-GPU Cluster (production training scale)": "large",
    }
    context_toggle = mo.ui.radio(
        label="Cluster context (sets scale for all computations):",
        options=_scale_options,
        value="8-GPU Cluster (single-node reference scale)",
        inline=True,
    )
    context_toggle
    return (context_toggle,)
# ═════════════════════════════════════════════════════════════════════════════
# ACT I — THE BISECTION BANDWIDTH BLINDSPOT
# ═════════════════════════════════════════════════════════════════════════════
@app.cell(hide_code=True)
def _(mo):
    """Render the Act I section banner (title, timing, framing question)."""
    mo.vstack([
        mo.Html("""<div style="margin: 24px 0 8px 0;">
<div style="display:flex; align-items:center; gap:12px;">
<div style="background:#006395; color:white; border-radius:50%;
width:28px; height:28px; display:inline-flex; align-items:center;
justify-content:center; font-size:0.85rem; font-weight:800;
flex-shrink:0;">I</div>
<div style="flex:1; height:2px; background:#e2e8f0;"></div>
<div style="font-size:0.72rem; font-weight:700; color:#94a3b8;
text-transform:uppercase; letter-spacing:0.12em;">
Act I · 1215 min
</div>
</div>
<div style="font-size:1.6rem; font-weight:800; color:#0f172a;
margin-top:8px; line-height:1.2;">
The Bisection Bandwidth Blindspot
</div>
<div style="color:#475569; font-size:0.95rem; margin-top:4px;">
Why is AllReduce running at 40% of expected throughput on our new cluster?
</div>
</div>"""),
    ])
    return
# ─── ACT I: STAKEHOLDER MESSAGE ───────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, COLORS):
    """Render the Act I scenario: a network architect's 40%-throughput puzzle."""
    # Blue accent marks an informational (non-failure) stakeholder message.
    _color = COLORS["BlueLine"]
    _bg = COLORS["BlueL"]
    mo.Html(f"""
<div style="border-left:4px solid {_color}; background:{_bg};
border-radius:0 10px 10px 0; padding:16px 22px; margin:12px 0;">
<div style="font-size:0.72rem; font-weight:700; color:{_color};
text-transform:uppercase; letter-spacing:0.1em; margin-bottom:6px;">
Incoming Message · Network Architect, Meridian AI Infrastructure
</div>
<div style="font-style:italic; font-size:1.0rem; color:#1e293b; line-height:1.65;">
"We commissioned a new 128-GPU training cluster. To reduce fabric cost we used
a 2:1 oversubscription on the spine switches — half the uplinks of a full fat-tree.
The vendor assured us this is standard for most workloads. But our distributed
training AllReduce is running at 40% of the throughput we calculated. The GPUs are
idle 60% of the time waiting on gradients. What did we get wrong?"
</div>
<div style="font-size:0.78rem; color:#475569; margin-top:8px; font-weight:600;">
— Kenji Watanabe, Network Architect · Meridian AI Infrastructure (128-GPU cluster)
</div>
</div>
""")
    return
# ─── ACT I: CONCEPT SETUP ─────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    """Explain bisection bandwidth and the oversubscription penalty (Act I)."""
    mo.vstack([
        mo.md("""
## Bisection Bandwidth is the All-to-All Ceiling
**Bisection bandwidth** is the minimum bandwidth when you cut a network into two
equal halves. It is the theoretical ceiling for any all-to-all communication pattern
— the kind AllReduce uses when every GPU must exchange gradients with every other GPU.
For a **fat-tree with no oversubscription** (non-blocking), the bisection bandwidth is:
```
BW_bisection = N × link_BW / 2
```
Where `N` is the number of GPUs and `link_BW` is the bandwidth of each GPU uplink.
Every GPU can communicate at full link speed simultaneously — "no blocking" means
no link is ever the bottleneck.
When **oversubscription ratio `r`** is applied (e.g., 2:1 means half the spine
uplinks are removed), bisection bandwidth is cut by the same factor:
```
BW_bisection (oversubscribed) = N × link_BW / (2 × r)
```
A 2:1 oversubscription **halves bisection bandwidth**. AllReduce throughput for
an all-to-all workload is bounded by bisection bandwidth, so performance is also
halved — not slightly degraded.
"""),
        # Bridges the 50% physics to the stakeholder's observed 40% number.
        mo.callout(mo.md(
            "**The 40% observation.** A 2:1 oversubscription halves bisection bandwidth. "
            "If AllReduce efficiency was ~80% of theoretical on a non-blocking fabric, "
            "it becomes ~40% on the oversubscribed fabric — exactly the observation above. "
            "The GPU idleness is not a software bug. It is a physics constraint."
        ), kind="info"),
    ])
    return
# ─── ACT I: PREDICTION LOCK ───────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    """Act I prediction-lock radio; gates the instruments until answered.

    NOTE(review): the label computes 128 × 25 GB/s / 2 = 1,600 GB/s using the
    raw port rate (200 Gbps / 8 = 25 GB/s), while the instrument cell uses the
    22.5 GB/s *effective* rate (→ 1,440 GB/s). Confirm which figure the lab
    intends students to anchor on.
    """
    act1_prediction = mo.ui.radio(
        options={
            "A) 40% of expected is close enough — oversubscription rarely matters for ML workloads": "option_a",
            "B) The 2:1 oversubscription halves bisection bandwidth — all-to-all patterns see 50% of expected throughput": "option_b",
            "C) The issue is latency, not bandwidth — too many hops in the fat-tree add queueing delay": "option_c",
            "D) 128 GPUs exceeds the practical scaling limit for fat-tree topologies": "option_d",
        },
        label="""**Prediction Lock — Act I.**
A 128-GPU cluster uses InfiniBand HDR200 (200 Gbps per port) with a 2:1 oversubscription
ratio at the spine layer. A non-blocking fat-tree of the same cluster would have
bisection bandwidth = 128 × 25 GB/s / 2 = 1,600 GB/s total.
The AllReduce throughput is 40% of expected. Which explanation best accounts for this?""",
    )
    act1_prediction
    return (act1_prediction,)
@app.cell(hide_code=True)
def _(mo, act1_prediction):
    """Gate: halt downstream cells until the Act I prediction is locked in."""
    _locked_hint = mo.callout(
        mo.md("Select your prediction above to unlock the Bisection Bandwidth instruments."),
        kind="warn",
    )
    mo.stop(act1_prediction.value is None, _locked_hint)
    return
# ─── ACT I: INSTRUMENTS ───────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction):
    """Expose the three Act I controls: cluster size, link type, oversub ratio."""
    mo.stop(act1_prediction.value is None)
    # Slider step of 8 keeps N aligned with whole 8-GPU nodes.
    act1_cluster_size = mo.ui.slider(
        start=8, stop=512, value=128, step=8,
        label="Cluster size N (GPUs)",
    )
    # Dropdown values are keys into the instrument cell's link-bandwidth maps.
    act1_link_bw = mo.ui.dropdown(
        options={
            "IB HDR200 — 200 Gbps (22.5 GB/s eff)": "ib_hdr200",
            "IB NDR400 — 400 Gbps (45.0 GB/s eff)": "ib_ndr400",
            "100GbE — 100 Gbps (11.0 GB/s eff)": "eth_100g",
            "400GbE — 400 Gbps (44.0 GB/s eff)": "eth_400g",
        },
        value="IB HDR200 — 200 Gbps (22.5 GB/s eff)",
        label="Link type (per GPU uplink)",
    )
    # Oversubscription ratio r maps directly into BW = N × link / (2 × r).
    act1_oversub = mo.ui.dropdown(
        options={
            "1:1 — Non-blocking (full fat-tree)": 1.0,
            "2:1 — Half spine uplinks": 2.0,
            "4:1 — Quarter spine uplinks": 4.0,
        },
        value="1:1 — Non-blocking (full fat-tree)",
        label="Oversubscription ratio",
    )
    mo.vstack([
        mo.md("### Bisection Bandwidth Calculator"),
        mo.hstack([act1_cluster_size, act1_link_bw], justify="start", gap="2rem"),
        act1_oversub,
    ])
    return (act1_cluster_size, act1_link_bw, act1_oversub)
@app.cell(hide_code=True)
def _(
    mo, act1_prediction, act1_cluster_size, act1_link_bw, act1_oversub,
    go, apply_plotly_theme, COLORS,
    IB_HDR200_EFF_GBS, IB_NDR400_EFF_GBS, ETH_100G_EFF_GBS, ETH_400G_EFF_GBS,
):
    """Compute bisection bandwidth for the selected config and visualize it.

    Renders metric cards, a live formula trace, and a bar chart comparing the
    three oversubscription ratios at the chosen cluster size and link type.

    NOTE(review): this cell returns underscore-prefixed names (`_bw_oversub`,
    `_bw_full`, …). Marimo treats `_`-prefixed variables as private to their
    cell, so the downstream overlay cell that requests them may fail to
    resolve — fixing that needs a coordinated rename in both cells; confirm
    against marimo's variable-scoping rules before changing the interface.
    """
    mo.stop(act1_prediction.value is None)
    # ── Link bandwidth lookup (keys match act1_link_bw option values) ─────────
    _link_map = {
        "ib_hdr200": IB_HDR200_EFF_GBS,
        "ib_ndr400": IB_NDR400_EFF_GBS,
        "eth_100g": ETH_100G_EFF_GBS,
        "eth_400g": ETH_400G_EFF_GBS,
    }
    _link_label_map = {
        "ib_hdr200": "IB HDR200",
        "ib_ndr400": "IB NDR400",
        "eth_100g": "100GbE",
        "eth_400g": "400GbE",
    }
    _link_bw_gbs = _link_map[act1_link_bw.value]
    _link_label = _link_label_map[act1_link_bw.value]
    _N = act1_cluster_size.value
    _r = act1_oversub.value
    # ── Bisection bandwidth formula ───────────────────────────────────────────
    # BW_bisection = N × link_BW / (2 × oversubscription_ratio)
    # Source: @sec-network-fabrics-bisection-bandwidth
    _bw_full = _N * _link_bw_gbs / 2.0            # GB/s, non-blocking
    _bw_oversub = _N * _link_bw_gbs / (2.0 * _r)  # GB/s, with oversubscription
    _bw_per_gpu = _bw_oversub / _N                # GB/s per GPU (bisection share)
    # ── AllReduce bandwidth efficiency ────────────────────────────────────────
    # Ring-AllReduce sends 2*(N-1)/N * message_size bytes total per GPU.
    # For large N, effective per-GPU AllReduce bandwidth ≈ bisection_bw / N.
    # Efficiency relative to non-blocking fabric (guard avoids 0/0 at N=0):
    _efficiency_pct = (_bw_oversub / _bw_full) * 100.0 if _bw_full > 0 else 0.0
    # ── Bottleneck classification (drives card accent color) ──────────────────
    if _r <= 1.0:
        _fabric_status = "Non-blocking"
        _status_color = COLORS["GreenLine"]
    elif _r <= 2.0:
        _fabric_status = "2:1 Oversubscribed"
        _status_color = COLORS["OrangeLine"]
    else:
        _fabric_status = "4:1 Oversubscribed"
        _status_color = COLORS["RedLine"]
    # ── Comparison bar chart: bandwidth at each oversubscription ratio ────────
    _oversub_labels = ["1:1 (Non-blocking)", "2:1 Oversubscribed", "4:1 Oversubscribed"]
    _oversub_ratios = [1.0, 2.0, 4.0]
    _bw_values = [_N * _link_bw_gbs / (2.0 * r) for r in _oversub_ratios]
    _bar_colors = [COLORS["GreenLine"], COLORS["OrangeLine"], COLORS["RedLine"]]
    _selected_idx = _oversub_ratios.index(_r)
    _fig = go.Figure()
    _fig.add_trace(go.Bar(
        x=_oversub_labels,
        y=_bw_values,
        # Selected bar: dark fill with colored outline; others: plain fill.
        marker_color=[
            _bar_colors[i] if i != _selected_idx else "#1e293b"
            for i in range(len(_bar_colors))
        ],
        marker_line_color=[
            _bar_colors[i] for i in range(len(_bar_colors))
        ],
        marker_line_width=[
            3 if i == _selected_idx else 1 for i in range(len(_bar_colors))
        ],
        # BUG FIX: trace-level `opacity` is a scalar property in plotly and
        # rejects a list. Per-bar opacity is supported by the array-ok
        # `marker.opacity`, addressed here via the magic underscore.
        marker_opacity=[0.55 if i != _selected_idx else 1.0 for i in range(3)],
        text=[f"{v:,.0f} GB/s" for v in _bw_values],
        textposition="outside",
        textfont=dict(size=11, family="SF Mono, monospace"),
        width=0.55,
    ))
    # Star marker highlights the currently selected configuration.
    _fig.add_trace(go.Scatter(
        x=[_oversub_labels[_selected_idx]],
        y=[_bw_values[_selected_idx]],
        mode="markers",
        marker=dict(
            symbol="star",
            size=14,
            color=_bar_colors[_selected_idx],
            line=dict(color="white", width=1),
        ),
        name="Current config",
        showlegend=False,
    ))
    _fig.update_layout(
        title=dict(
            text=f"Bisection Bandwidth by Oversubscription — {_N}-GPU cluster, {_link_label}",
            font=dict(size=13, color="#1e293b"),
            x=0,
        ),
        height=320,
        yaxis=dict(title="Total Bisection Bandwidth (GB/s)", gridcolor="#f1f5f9"),
        xaxis=dict(title="Oversubscription Ratio"),
        showlegend=False,
        margin=dict(l=60, r=20, t=50, b=40),
    )
    apply_plotly_theme(_fig)
    # ── Metric cards (total BW, per-GPU share, efficiency, non-blocking peak) ──
    _cards_html = f"""
<div style="display:flex; gap:16px; flex-wrap:wrap; margin:16px 0;">
<div style="padding:18px; border:1px solid #e2e8f0; border-radius:10px;
min-width:175px; text-align:center; background:white;
border-top:3px solid {_status_color};">
<div style="color:#94a3b8; font-size:0.78rem; font-weight:600; margin-bottom:4px;">
Total Bisection BW
</div>
<div style="font-size:1.35rem; font-weight:800; color:{_status_color};
font-family:'SF Mono',monospace;">
{_bw_oversub:,.0f} GB/s
</div>
<div style="font-size:0.72rem; color:#94a3b8; margin-top:4px;">
{_fabric_status}
</div>
</div>
<div style="padding:18px; border:1px solid #e2e8f0; border-radius:10px;
min-width:175px; text-align:center; background:white;
border-top:3px solid {COLORS['BlueLine']};">
<div style="color:#94a3b8; font-size:0.78rem; font-weight:600; margin-bottom:4px;">
Per-GPU BW Share
</div>
<div style="font-size:1.35rem; font-weight:800; color:{COLORS['BlueLine']};
font-family:'SF Mono',monospace;">
{_bw_per_gpu:.2f} GB/s
</div>
<div style="font-size:0.72rem; color:#94a3b8; margin-top:4px;">
of {_link_bw_gbs:.1f} GB/s link speed
</div>
</div>
<div style="padding:18px; border:1px solid #e2e8f0; border-radius:10px;
min-width:175px; text-align:center; background:white;
border-top:3px solid {'#008F45' if _efficiency_pct >= 90 else '#CC5500' if _efficiency_pct >= 60 else '#CB202D'};">
<div style="color:#94a3b8; font-size:0.78rem; font-weight:600; margin-bottom:4px;">
AllReduce Efficiency
</div>
<div style="font-size:1.35rem; font-weight:800;
color:{'#008F45' if _efficiency_pct >= 90 else '#CC5500' if _efficiency_pct >= 60 else '#CB202D'};
font-family:'SF Mono',monospace;">
{_efficiency_pct:.0f}%
</div>
<div style="font-size:0.72rem; color:#94a3b8; margin-top:4px;">
vs non-blocking fabric
</div>
</div>
<div style="padding:18px; border:1px solid #e2e8f0; border-radius:10px;
min-width:175px; text-align:center; background:white;
border-top:3px solid {COLORS['BlueLine']};">
<div style="color:#94a3b8; font-size:0.78rem; font-weight:600; margin-bottom:4px;">
Non-blocking Peak
</div>
<div style="font-size:1.35rem; font-weight:800; color:{COLORS['TextSec']};
font-family:'SF Mono',monospace;">
{_bw_full:,.0f} GB/s
</div>
<div style="font-size:0.72rem; color:#94a3b8; margin-top:4px;">
1:1 fat-tree reference
</div>
</div>
</div>
"""
    # ── Physics formula display (live numbers substituted into the formula) ───
    _formula_text = f"""
**Bisection Bandwidth — Live Calculation** (N={_N} GPUs, link={_link_bw_gbs:.1f} GB/s, r={_r:.0f}:1)
```
BW_bisection = N × link_BW / (2 × r)
= {_N} × {_link_bw_gbs:.1f} GB/s / (2 × {_r:.0f})
= {_bw_oversub:,.1f} GB/s ← {_fabric_status}
Non-blocking = {_N} × {_link_bw_gbs:.1f} GB/s / 2
= {_bw_full:,.1f} GB/s ← full fat-tree
AllReduce efficiency = {_bw_oversub:,.1f} / {_bw_full:,.1f}
= {_efficiency_pct:.1f}% ← degradation from oversubscription
```
"""
    mo.vstack([
        mo.Html(_cards_html),
        mo.md(_formula_text),
        mo.ui.plotly(_fig),
    ])
    return (
        _bw_oversub, _bw_full, _efficiency_pct,
        _N, _r, _link_bw_gbs, _link_label,
    )
# ─── ACT I: PREDICTION VS REALITY OVERLAY ─────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction, _bw_oversub, _bw_full, _efficiency_pct, _r):
    """Compare the student's locked prediction against the computed efficiency.

    NOTE(review): this cell requests underscore-prefixed names from the
    instrument cell; marimo treats `_`-prefixed variables as cell-private, so
    this wiring may not resolve at runtime — verify, and if broken, rename the
    shared names (no underscore) in both cells together.
    """
    mo.stop(act1_prediction.value is None)
    # Map each answer choice to the efficiency it implicitly predicts,
    # so the gap to reality can be stated in percentage points.
    _predicted_pct = {
        "option_a": 90.0,  # Student predicted oversubscription "rarely matters"
        "option_b": 50.0,  # Student predicted 50% — correct physics
        "option_c": 75.0,  # Student predicted latency (hops), not BW reduction
        "option_d": 20.0,  # Student predicted capacity limit, not oversubscription
    }[act1_prediction.value]
    _actual_pct = _efficiency_pct
    _gap = abs(_actual_pct - _predicted_pct)
    _is_correct = act1_prediction.value == "option_b"
    if _is_correct:
        _overlay = mo.callout(mo.md(
            f"**Correct.** Your prediction of {_predicted_pct:.0f}% matches the physics. "
            f"The actual AllReduce efficiency for {_r:.0f}:1 oversubscription is "
            f"**{_actual_pct:.0f}%** of a non-blocking fabric — "
            f"bisection bandwidth is cut by exactly the oversubscription ratio. "
            f"There is no partial degradation: the halved spine capacity directly halves "
            f"the cross-bisection bandwidth available for AllReduce."
        ), kind="success")
    else:
        _overlay = mo.callout(mo.md(
            f"**Not quite.** You predicted {_predicted_pct:.0f}% efficiency. "
            f"The actual value for {_r:.0f}:1 oversubscription is **{_actual_pct:.0f}%** "
            f"— off by {_gap:.0f} percentage points. "
            f"The correct answer is B: the 2:1 oversubscription **halves** bisection "
            f"bandwidth (from {_bw_full:,.0f} GB/s to {_bw_oversub:,.0f} GB/s). "
            f"AllReduce is an all-to-all pattern — its throughput is bounded by bisection "
            f"bandwidth, so the degradation is proportional, not incidental."
        ), kind="warn")
    _overlay
    return
# ─── ACT I: MATHPEEK ACCORDION ────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction):
    """Collapsible derivation: fat-tree bisection formula and topology scaling."""
    mo.stop(act1_prediction.value is None)
    mo.accordion({
        "The governing equations: Fat-tree bisection bandwidth": mo.md("""
**Bisection Bandwidth Formula (fat-tree):**
```
BW_bisection = N × link_BW / (2 × r)
```
- **N** — Number of end-hosts (GPUs) in the cluster
- **link_BW** — Unidirectional bandwidth of each GPU uplink (GB/s)
- **r** — Oversubscription ratio (1 = non-blocking, 2 = half spine, 4 = quarter spine)
- **2** — Factor of 2 because bisection cuts the network in half;
each half can send at most half the total uplink bandwidth
**Non-blocking condition (r = 1):**
```
BW_bisection_max = N × link_BW / 2
```
A k-ary fat-tree has k/2 spine switches, each with k ports.
Full non-blocking requires as many uplink ports at each layer as downlink ports.
Oversubscription reduces spine uplinks, creating the bottleneck at the bisection cut.
**AllReduce time (ring-allreduce, large N):**
```
T_allreduce ≈ 2 × M / (BW_bisection / N)
= 2 × N × M / BW_bisection
```
- **M** — Message size (gradient tensor, GB)
- The "2" comes from the two phases of ring-allreduce (reduce-scatter + all-gather)
**Scaling behavior by topology:**
| Topology | Bisection BW scales as | Notes |
|----------------|------------------------|------------------------------|
| Full fat-tree | O(N) | Linear — ideal for AllReduce |
| 2:1 fat-tree | O(N/2) | Linear, but halved |
| 3D-torus | O(N^(2/3)) | Sub-linear — poor at scale |
| Ring | O(1) per node | Fixed 2 links per GPU |
"""),
    })
    return
# ─── ACT I: STRUCTURED REFLECTION ─────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction):
    """Horizontal rule separating the Act I overlay from the reflection quiz."""
    mo.stop(act1_prediction.value is None)
    _rule = mo.md("---")
    _rule
    return
@app.cell(hide_code=True)
def _(mo, act1_prediction):
    """Act I reflection quiz: what makes a fat-tree non-blocking?"""
    mo.stop(act1_prediction.value is None)
    act1_reflection = mo.ui.radio(
        options={
            "A) All switches have equal port count — k ports in, k ports out at every layer": "ref_a",
            "B) Every leaf-to-leaf path has the same bandwidth as a direct link — no bandwidth bottleneck at any layer": "ref_b",
            "C) The tree has exactly 3 layers: edge, aggregation, core": "ref_c",
            "D) Maximum path length is log₂(N) hops, minimizing queueing latency": "ref_d",
        },
        label="""**Reflection — Act I.**
What is the defining property of a non-blocking fat-tree topology?""",
    )
    act1_reflection
    return (act1_reflection,)
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection):
    """Grade the Act I reflection answer and render targeted feedback."""
    mo.stop(act1_prediction.value is None)
    mo.stop(act1_reflection.value is None, mo.callout(
        mo.md("Select your answer to see the explanation."), kind="warn"
    ))
    # Per-choice explanations; only "ref_b" states the defining property.
    _explanations = {
        "ref_a": (
            "**Not quite.** Equal port counts is a property of symmetric switches "
            "but does not guarantee non-blocking behavior. A switch can have equal "
            "ingress and egress ports and still create bandwidth contention if "
            "aggregation-layer uplinks are fewer than downlinks — which is exactly "
            "what oversubscription does."
        ),
        "ref_b": (
            "**Correct.** A non-blocking fat-tree guarantees that any leaf-to-leaf "
            "path can sustain full link bandwidth simultaneously with all other "
            "leaf-to-leaf paths. This requires that at every layer, the total "
            "uplink capacity equals the total downlink capacity. When this holds, "
            "no switch layer is ever a bandwidth bottleneck — bisection bandwidth "
            "equals N × link_BW / 2, the theoretical maximum."
        ),
        "ref_c": (
            "**Not quite.** Fat-trees are commonly depicted with 3 layers "
            "(edge / aggregation / core), but this is a specific implementation "
            "choice, not the defining property. A fat-tree can have more layers. "
            "The defining property is the bandwidth guarantee at every cut, "
            "not the number of layers."
        ),
        "ref_d": (
            "**Not quite.** O(log N) path length is a property of fat-trees "
            "but is not what makes them non-blocking. Latency and bandwidth are "
            "independent properties. A topology can have short paths and still "
            "be bandwidth-bottlenecked at the bisection if uplink capacity is "
            "insufficient."
        ),
    }
    _choice = act1_reflection.value
    _tone = "success" if _choice == "ref_b" else "warn"
    mo.callout(mo.md(_explanations[_choice]), kind=_tone)
    return
# ═════════════════════════════════════════════════════════════════════════════
# ACT II — THE 1024-GPU FABRIC DESIGN
# ═════════════════════════════════════════════════════════════════════════════
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection):
    """Render the Act II section banner; unlocked only after Act I is complete."""
    mo.stop(act1_prediction.value is None)
    mo.stop(act1_reflection.value is None)
    mo.vstack([
        mo.Html("""<div style="margin: 24px 0 8px 0;">
<div style="display:flex; align-items:center; gap:12px;">
<div style="background:#CB202D; color:white; border-radius:50%;
width:28px; height:28px; display:inline-flex; align-items:center;
justify-content:center; font-size:0.85rem; font-weight:800;
flex-shrink:0;">II</div>
<div style="flex:1; height:2px; background:#e2e8f0;"></div>
<div style="font-size:0.72rem; font-weight:700; color:#94a3b8;
text-transform:uppercase; letter-spacing:0.12em;">
Act II · 2025 min
</div>
</div>
<div style="font-size:1.6rem; font-weight:800; color:#0f172a;
margin-top:8px; line-height:1.2;">
The 1024-GPU Fabric Design
</div>
<div style="color:#475569; font-size:0.95rem; margin-top:4px;">
$50M hardware budget. Three fabric options. One dominant AllReduce workload.
</div>
</div>"""),
    ])
    return
# ─── ACT II: STAKEHOLDER MESSAGE ──────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, COLORS):
    """Render the Act II scenario: VP asks which $-constrained fabric to buy.

    NOTE(review): the message says "$2025M allocated to compute" — this looks
    like a mangled en-dash range ("$20–25M"); it is a runtime string, so
    confirm against the original file before correcting.
    """
    mo.stop(act1_prediction.value is None)
    mo.stop(act1_reflection.value is None)
    # Orange accent marks a decision-pressure (design-challenge) message.
    _color = COLORS["OrangeLine"]
    _bg = COLORS["OrangeL"]
    mo.Html(f"""
<div style="border-left:4px solid {_color}; background:{_bg};
border-radius:0 10px 10px 0; padding:16px 22px; margin:12px 0;">
<div style="font-size:0.72rem; font-weight:700; color:{_color};
text-transform:uppercase; letter-spacing:0.1em; margin-bottom:6px;">
Incoming Message · Infrastructure VP, Apex Foundation Models
</div>
<div style="font-style:italic; font-size:1.0rem; color:#1e293b; line-height:1.65;">
"We are building a 1024-GPU cluster for large language model pre-training.
Total hardware budget: $50M, with $2025M allocated to compute (GPUs).
I have three fabric proposals on my desk:
(A) 400G InfiniBand NDR fat-tree — $20M, 1:1 non-blocking;
(B) 200G Ethernet fat-tree — $8M, 2:1 oversubscription;
(C) 3D-torus — $5M, fixed bisection per plane.
Our primary workload is Llama-class model training with large AllReduce passes
(32 GB gradient tensors every iteration). Which fabric do I choose?"
</div>
<div style="font-size:0.78rem; color:#475569; margin-top:8px; font-weight:600;">
— Dr. Amara Osei, VP Infrastructure · Apex Foundation Models (1024-GPU cluster)
</div>
</div>
""")
    return
# ─── ACT II: CONCEPT SETUP ────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection):
    """Explain the three fabric options and their bisection-bandwidth scaling."""
    mo.stop(act1_prediction.value is None)
    mo.stop(act1_reflection.value is None)
    mo.vstack([
        mo.md("""
## Three Fabrics, One Bottleneck
Each fabric architecture has a different bisection bandwidth model at scale:
**InfiniBand NDR fat-tree (1:1 non-blocking):**
All 1024 GPU uplinks have matching spine capacity. Bisection bandwidth scales
linearly with cluster size. The most expensive option per port, but no contention.
**Ethernet fat-tree (2:1 oversubscribed):**
Half the spine uplinks are removed to reduce cost. Bisection bandwidth is halved.
AllReduce sees only 50% of the per-GPU link speed at the bisection.
**3D-torus:**
Each GPU connects to 6 neighbors in three dimensions. There is no central spine.
Bisection bandwidth cuts one dimension of the torus — for a 1024-GPU cube,
that is a 10×10 plane of links. Bisection bandwidth scales as O(N^(2/3)), not O(N).
```
3D-torus bisection BW = (N)^(2/3) × link_BW
Fat-tree bisection BW = N × link_BW / (2 × r)
```
At small N, the 3D-torus can be competitive. At 1024 GPUs, the fat-tree
advantage is pronounced. At 16,384 GPUs, the torus bisection bandwidth is
roughly **16× lower** per GPU than a non-blocking fat-tree.
"""),
        # Grounds the abstract formulas in a concrete training workload.
        mo.callout(mo.md(
            "**AllReduce at scale.** Every iteration of large-model training exchanges "
            "gradient tensors across all GPUs. For a 70B-parameter model in FP16, "
            "each AllReduce moves ~140 GB of data. The time to complete AllReduce "
            "determines whether GPUs are computing or waiting — it is the direct "
            "multiplier on total training time."
        ), kind="info"),
    ])
    return
# ─── ACT II: PREDICTION LOCK ──────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection):
    """Act II prediction lock: commit to a fabric choice before the instruments unlock."""
    mo.stop(
        act1_prediction.value is None
        or act1_reflection.value is None
    )
    _choices = {
        "A) Option C — 3D-torus: cheapest option and bandwidth scales well for mesh workloads": "pred_torus",
        "B) Option B — Ethernet 2:1: good enough at 50% of IB performance for 40% of the cost": "pred_eth",
        "C) Option A — IB NDR fat-tree: AllReduce is all-to-all, bisection bandwidth is the bottleneck": "pred_ib",
        "D) All three topologies perform equally for AllReduce — link speed is all that matters": "pred_equal",
    }
    act2_prediction = mo.ui.radio(
        label="""**Prediction Lock — Act II.**
For a 1024-GPU cluster running 32 GB AllReduce tensors with a target AllReduce
completion time < 1 second, which fabric design should the Infrastructure VP choose?""",
        options=_choices,
    )
    act2_prediction
    return (act2_prediction,)
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction):
    """Gate cell: halt downstream Act II cells until the prediction above is locked."""
    mo.stop(
        act1_prediction.value is None
        or act1_reflection.value is None
    )
    # When the prediction is still unanswered, stop and show the unlock hint.
    _unlock_hint = mo.callout(
        mo.md("Select your prediction above to unlock the Fabric Design instruments."),
        kind="warn",
    )
    mo.stop(act2_prediction.value is None, _unlock_hint)
    return
# ─── ACT II: INSTRUMENTS ──────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction):
    """Act II instruments: fabric type, cluster size, and AllReduce tensor-size controls."""
    mo.stop(
        act1_prediction.value is None
        or act1_reflection.value is None
        or act2_prediction.value is None
    )
    _fabric_choices = {
        "IB NDR fat-tree 1:1 (non-blocking)": "ib_fat_tree",
        "Ethernet fat-tree 2:1 (oversubscribed)": "eth_fat_tree",
        "3D-torus (fixed bisection)": "torus_3d",
    }
    act2_fabric = mo.ui.dropdown(
        label="Fabric type",
        options=_fabric_choices,
        value="IB NDR fat-tree 1:1 (non-blocking)",
    )
    act2_cluster_n = mo.ui.slider(
        label="Cluster size N (GPUs)",
        start=64, stop=4096, value=1024, step=64,
    )
    act2_model_gb = mo.ui.slider(
        label="AllReduce tensor size (GB)",
        start=1, stop=200, value=32, step=1,
    )
    # Layout: dropdown + cluster slider side by side, tensor slider below.
    mo.vstack([
        mo.md("### Fabric Design Dashboard"),
        mo.hstack([act2_fabric, act2_cluster_n], justify="start", gap="2rem"),
        act2_model_gb,
    ])
    return (act2_fabric, act2_cluster_n, act2_model_gb)
@app.cell(hide_code=True)
def _(
    mo, act1_prediction, act1_reflection, act2_prediction,
    act2_fabric, act2_cluster_n, act2_model_gb,
    go, apply_plotly_theme, COLORS, math,
    IB_NDR400_EFF_GBS, ETH_100G_EFF_GBS,
    IB_NDR_FABRIC_COST_M, ETH_400G_FABRIC_COST_M, TORUS_3D_FABRIC_COST_M,
):
    """Act II Fabric Design Dashboard.

    Computes bisection bandwidth, AllReduce time, training efficiency, and
    cost-performance for the selected fabric, then renders metric cards, the
    governing formulas, a topology-scaling chart, and a cost/AllReduce bar
    chart. Gated until Act I is complete and the Act II prediction is locked.
    """
    mo.stop(act1_prediction.value is None)
    mo.stop(act1_reflection.value is None)
    mo.stop(act2_prediction.value is None)
    _fabric = act2_fabric.value
    _N = act2_cluster_n.value
    _msg_gb = act2_model_gb.value
    # ── Per-fabric bisection bandwidth physics ────────────────────────────────
    # Source: @sec-network-fabrics-topology-comparison
    if _fabric == "ib_fat_tree":
        # IB NDR400 fat-tree, 1:1 non-blocking
        # link_BW = 45.0 GB/s per GPU (NDR400 effective unidirectional)
        _link_bw_gbs = IB_NDR400_EFF_GBS  # 45.0 GB/s
        _oversub_ratio = 1.0
        _bw_bisect = _N * _link_bw_gbs / (2.0 * _oversub_ratio)  # O(N)
        _fabric_name = "IB NDR fat-tree (1:1)"
        _fabric_color = COLORS["GreenLine"]
        _fabric_cost_m = IB_NDR_FABRIC_COST_M  # $20M
        _topology_note = "Non-blocking · O(N) bisection BW"
    elif _fabric == "eth_fat_tree":
        # Ethernet 100GbE fat-tree, 2:1 oversubscription
        # Using 100GbE at $8M price point (200GbE ports at 2:1 = 100G effective)
        _link_bw_gbs = ETH_100G_EFF_GBS * 2.0  # 22.0 GB/s (200GbE effective)
        _oversub_ratio = 2.0
        _bw_bisect = _N * _link_bw_gbs / (2.0 * _oversub_ratio)  # O(N/2)
        _fabric_name = "Ethernet fat-tree (2:1)"
        _fabric_color = COLORS["OrangeLine"]
        _fabric_cost_m = ETH_400G_FABRIC_COST_M  # $8M
        _topology_note = "2:1 Oversubscribed · O(N/2) bisection BW"
    else:
        # 3D-torus: bisection BW = N^(2/3) × link_BW / 2
        # Each GPU has 6 neighbors (±x, ±y, ±z); bisection cuts one dimension
        # through a cross-sectional plane of ~N^(2/3) links.
        # Source: @sec-network-fabrics-topology-comparison (torus bisection derivation)
        _link_bw_gbs = ETH_100G_EFF_GBS  # 11.0 GB/s (cost-equivalent links)
        # round(), not int(): float powers of perfect cubes land just below the
        # exact value (e.g. 64 ** (2/3) == 15.999…), so int() would truncate
        # the bisection plane to 15 links instead of 16.
        _plane_links = round(_N ** (2.0 / 3.0))  # links crossing the bisection
        _bw_bisect = _plane_links * _link_bw_gbs / 2.0  # O(N^(2/3))
        _fabric_name = "3D-torus"
        _fabric_color = COLORS["RedLine"]
        _fabric_cost_m = TORUS_3D_FABRIC_COST_M  # $5M
        _topology_note = f"Fixed mesh · O(N^(2/3)) bisection BW · plane links={_plane_links}"
    # ── AllReduce time calculation ─────────────────────────────────────────────
    # Ring-allreduce: T = 2 × (N-1)/N × M / (BW_bisection / N)
    # For large N → T ≈ 2 × M / (BW_bisection / N) = 2 × N × M / BW_bisection
    # Source: @sec-network-fabrics-allreduce (Rabenseifner ring-allreduce analysis)
    _bw_per_gpu = _bw_bisect / _N  # GB/s per GPU
    _factor = 2.0 * (_N - 1) / _N if _N > 1 else 2.0
    _allreduce_sec = _factor * _msg_gb / _bw_per_gpu if _bw_per_gpu > 0 else float("inf")
    # ── SLA threshold: AllReduce must complete < 1 second ─────────────────────
    # Target from stakeholder: AllReduce < 1s for productive training throughput
    # Source: @sec-network-fabrics (AllReduce latency target, LLM training)
    _SLA_SEC = 1.0
    _sla_violated = _allreduce_sec > _SLA_SEC
    # ── Training efficiency (fraction of time GPUs are computing, not waiting) ─
    # Assume compute time per iteration ≈ 5 seconds for a 70B-parameter model step
    # on 1024 GPUs; AllReduce is the synchronization barrier.
    # Source: rough model from @sec-network-fabrics (communication overhead)
    _COMPUTE_SEC = 5.0
    _train_eff_pct = _COMPUTE_SEC / (_COMPUTE_SEC + _allreduce_sec) * 100.0
    _eff_color = (
        COLORS["GreenLine"] if _train_eff_pct >= 85 else
        COLORS["OrangeLine"] if _train_eff_pct >= 60 else
        COLORS["RedLine"]
    )
    # ── Cost-performance ratio ────────────────────────────────────────────────
    # How many GB/s of bisection bandwidth per $1M of fabric cost
    _bw_per_dollar_m = _bw_bisect / _fabric_cost_m if _fabric_cost_m > 0 else 0.0
    # ── Topology comparison across cluster sizes ───────────────────────────────
    _n_range = list(range(64, 4097, 64))
    _bw_ib_fat = [n * IB_NDR400_EFF_GBS / 2.0 for n in _n_range]
    _bw_eth_fat = [n * ETH_100G_EFF_GBS * 2.0 / (2 * 2.0) for n in _n_range]
    # round() for the same truncation reason as _plane_links above.
    _bw_torus = [round(n ** (2.0 / 3.0)) * ETH_100G_EFF_GBS / 2.0 for n in _n_range]
    _fig = go.Figure()
    _fig.add_trace(go.Scatter(
        x=_n_range, y=_bw_ib_fat, mode="lines", name="IB NDR fat-tree (1:1)",
        line=dict(color=COLORS["GreenLine"], width=2.5),
    ))
    _fig.add_trace(go.Scatter(
        x=_n_range, y=_bw_eth_fat, mode="lines", name="Ethernet fat-tree (2:1)",
        line=dict(color=COLORS["OrangeLine"], width=2.5, dash="dash"),
    ))
    _fig.add_trace(go.Scatter(
        x=_n_range, y=_bw_torus, mode="lines", name="3D-torus",
        line=dict(color=COLORS["RedLine"], width=2.5, dash="dot"),
    ))
    # Mark the current configuration
    _fig.add_trace(go.Scatter(
        x=[_N], y=[_bw_bisect], mode="markers",
        name=f"Current: {_fabric_name}",
        marker=dict(symbol="star", size=14, color=_fabric_color,
                    line=dict(color="white", width=1.5)),
        showlegend=True,
    ))
    # SLA line (AllReduce < 1s → minimum bisection BW needed)
    # T = 2 × N × M / BW_bisect < 1s → BW_bisect > 2 × N × M
    _min_bw_for_sla = [2.0 * n * _msg_gb for n in _n_range]
    _fig.add_trace(go.Scatter(
        x=_n_range, y=_min_bw_for_sla, mode="lines",
        name=f"Min BW for <1s AllReduce ({_msg_gb} GB tensor)",
        line=dict(color=COLORS["RedLine"], width=1.5, dash="longdash"),
        opacity=0.65,
    ))
    _fig.update_layout(
        title=dict(
            text="Bisection Bandwidth vs Cluster Size by Topology",
            font=dict(size=13, color="#1e293b"), x=0,
        ),
        height=380,
        yaxis=dict(title="Total Bisection Bandwidth (GB/s)", gridcolor="#f1f5f9"),
        xaxis=dict(title="Cluster Size N (GPUs)"),
        legend=dict(orientation="h", y=-0.22, x=0, font=dict(size=10)),
        margin=dict(l=60, r=20, t=50, b=100),
    )
    apply_plotly_theme(_fig)
    # ── Cost breakdown bar chart ──────────────────────────────────────────────
    _fabrics = ["IB NDR fat-tree\n(1:1)", "Ethernet fat-tree\n(2:1)", "3D-torus"]
    _costs_m = [IB_NDR_FABRIC_COST_M, ETH_400G_FABRIC_COST_M, TORUS_3D_FABRIC_COST_M]
    _bw_at_1024 = [
        1024 * IB_NDR400_EFF_GBS / 2.0,
        1024 * ETH_100G_EFF_GBS * 2.0 / (2 * 2.0),
        round(1024 ** (2.0 / 3.0)) * ETH_100G_EFF_GBS / 2.0,
    ]
    _ar_times = [
        2.0 * _msg_gb / (bw / 1024) for bw in _bw_at_1024
    ]
    _cost_colors = [COLORS["GreenLine"], COLORS["OrangeLine"], COLORS["RedLine"]]
    _selected_fab_idx = ["ib_fat_tree", "eth_fat_tree", "torus_3d"].index(_fabric)
    _fig2 = go.Figure()
    _fig2.add_trace(go.Bar(
        x=_fabrics,
        y=_costs_m,
        marker_color=_cost_colors,
        marker_line_color=["white"] * 3,
        marker_line_width=[0] * 3,
        # Dim the two non-selected bars. Per-bar opacity must be set on
        # marker.opacity — the trace-level `opacity` property is a scalar
        # and raises a plotly validation error if given a list.
        marker_opacity=[1.0 if i == _selected_fab_idx else 0.45 for i in range(3)],
        text=[f"${c:.0f}M" for c in _costs_m],
        textposition="outside",
        textfont=dict(size=11, family="SF Mono, monospace"),
        width=0.55,
        name="Fabric cost ($M)",
    ))
    # Overlay: AllReduce time as text annotation (green if under the 1s SLA)
    for _i, (_fab, _ar_t) in enumerate(zip(_fabrics, _ar_times)):
        _ann_color = "#008F45" if _ar_t < 1.0 else "#CB202D"
        _fig2.add_annotation(
            x=_fab, y=_costs_m[_i] + 0.5,
            text=f"AllReduce: {_ar_t:.2f}s",
            showarrow=False,
            font=dict(size=9, color=_ann_color, family="SF Mono, monospace"),
        )
    _fig2.update_layout(
        title=dict(
            text=f"Fabric Cost vs AllReduce Time — 1024 GPUs, {_msg_gb} GB tensor",
            font=dict(size=13, color="#1e293b"), x=0,
        ),
        height=320,
        yaxis=dict(title="Fabric Cost ($M)", gridcolor="#f1f5f9"),
        xaxis=dict(title="Fabric Type"),
        showlegend=False,
        margin=dict(l=60, r=20, t=50, b=40),
    )
    apply_plotly_theme(_fig2)
    # ── Physics formula display ───────────────────────────────────────────────
    if _fabric == "torus_3d":
        # _plane_links was computed in the torus branch above; no need to
        # recompute it here.
        _formula_text = f"""
**Bisection Bandwidth — 3D-torus** (N={_N}, link={_link_bw_gbs:.1f} GB/s)
```
Bisection links = N^(2/3) = {_N}^(2/3) = {_plane_links} links crossing bisection plane
BW_bisection = {_plane_links} × {_link_bw_gbs:.1f} GB/s / 2 = {_bw_bisect:,.1f} GB/s
BW per GPU = {_bw_bisect:,.1f} / {_N} = {_bw_per_gpu:.3f} GB/s
AllReduce time ≈ 2 × {_msg_gb} GB / {_bw_per_gpu:.3f} GB/s = {_allreduce_sec:.2f} s
```
**Scaling penalty at {_N} GPUs vs IB NDR fat-tree:**
```
IB fat-tree BW = {_N} × {IB_NDR400_EFF_GBS:.1f} / 2 = {_N * IB_NDR400_EFF_GBS / 2:.0f} GB/s
3D-torus BW = {_bw_bisect:,.1f} GB/s
Ratio = {(_N * IB_NDR400_EFF_GBS / 2) / _bw_bisect:.1f}× lower bisection BW
```
"""
    else:
        _formula_text = f"""
**Bisection Bandwidth — {_fabric_name}** (N={_N}, link={_link_bw_gbs:.1f} GB/s, r={_oversub_ratio:.0f}:1)
```
BW_bisection = N × link_BW / (2 × r)
= {_N} × {_link_bw_gbs:.1f} GB/s / (2 × {_oversub_ratio:.0f})
= {_bw_bisect:,.1f} GB/s ← {_topology_note}
BW per GPU = {_bw_bisect:,.1f} / {_N} = {_bw_per_gpu:.3f} GB/s
AllReduce ≈ 2 × (N-1)/N × M / (BW/N)
≈ 2 × {_msg_gb} GB / {_bw_per_gpu:.3f} GB/s
= {_allreduce_sec:.2f} s
```
"""
    # ── Metric cards ──────────────────────────────────────────────────────────
    _ar_color = COLORS["GreenLine"] if not _sla_violated else COLORS["RedLine"]
    _cards_html = f"""
<div style="display:flex; gap:16px; flex-wrap:wrap; margin:16px 0;">
<div style="padding:18px; border:1px solid #e2e8f0; border-radius:10px;
min-width:175px; text-align:center; background:white;
border-top:3px solid {_fabric_color};">
<div style="color:#94a3b8; font-size:0.78rem; font-weight:600; margin-bottom:4px;">
Bisection BW
</div>
<div style="font-size:1.35rem; font-weight:800; color:{_fabric_color};
font-family:'SF Mono',monospace;">
{_bw_bisect:,.0f} GB/s
</div>
<div style="font-size:0.72rem; color:#94a3b8; margin-top:4px;">
{_topology_note.split(' · ')[0]}
</div>
</div>
<div style="padding:18px; border:1px solid #e2e8f0; border-radius:10px;
min-width:175px; text-align:center; background:white;
border-top:3px solid {_ar_color};">
<div style="color:#94a3b8; font-size:0.78rem; font-weight:600; margin-bottom:4px;">
AllReduce Time
</div>
<div style="font-size:1.35rem; font-weight:800; color:{_ar_color};
font-family:'SF Mono',monospace;">
{_allreduce_sec:.2f} s
</div>
<div style="font-size:0.72rem; color:#94a3b8; margin-top:4px;">
target: &lt; {_SLA_SEC:.0f} s
</div>
</div>
<div style="padding:18px; border:1px solid #e2e8f0; border-radius:10px;
min-width:175px; text-align:center; background:white;
border-top:3px solid {_eff_color};">
<div style="color:#94a3b8; font-size:0.78rem; font-weight:600; margin-bottom:4px;">
Training Efficiency
</div>
<div style="font-size:1.35rem; font-weight:800; color:{_eff_color};
font-family:'SF Mono',monospace;">
{_train_eff_pct:.1f}%
</div>
<div style="font-size:0.72rem; color:#94a3b8; margin-top:4px;">
GPU compute fraction
</div>
</div>
<div style="padding:18px; border:1px solid #e2e8f0; border-radius:10px;
min-width:175px; text-align:center; background:white;
border-top:3px solid {COLORS['BlueLine']};">
<div style="color:#94a3b8; font-size:0.78rem; font-weight:600; margin-bottom:4px;">
Fabric Cost
</div>
<div style="font-size:1.35rem; font-weight:800; color:{COLORS['TextSec']};
font-family:'SF Mono',monospace;">
${_fabric_cost_m:.0f}M
</div>
<div style="font-size:0.72rem; color:#94a3b8; margin-top:4px;">
{_bw_per_dollar_m:,.0f} GB/s per $1M
</div>
</div>
</div>
"""
    mo.vstack([
        mo.Html(_cards_html),
        mo.md(_formula_text),
        mo.ui.plotly(_fig),
        mo.ui.plotly(_fig2),
    ])
    # NOTE(review): marimo normally treats underscore-prefixed names as
    # cell-private, yet downstream cells consume these as parameters —
    # confirm this file is executed in a context where that is legal.
    return (
        _allreduce_sec, _bw_bisect, _sla_violated,
        _fabric_name, _train_eff_pct, _fabric_cost_m,
    )
# ─── ACT II: FAILURE STATE ────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction,
      _allreduce_sec, _sla_violated, _fabric_name):
    """Failure-state callout: report whether the AllReduce SLA is met or violated."""
    mo.stop(
        act1_prediction.value is None
        or act1_reflection.value is None
        or act2_prediction.value is None
    )
    if _sla_violated:
        _kind = "danger"
        _message = (
            f"**Bisection bandwidth bottleneck — AllReduce SLA violated.** "
            f"AllReduce requires **{_allreduce_sec:.2f} s** on **{_fabric_name}**. "
            f"This exceeds the 1-second target. "
            f"At this AllReduce time, GPUs spend more time waiting on gradient "
            f"synchronization than computing. Training throughput is severely degraded. "
            f"Reduce the tensor size, increase bisection bandwidth (lower oversubscription "
            f"or switch fabric), or increase cluster link speed to meet the SLA."
        )
    else:
        _kind = "success"
        _message = (
            f"**AllReduce SLA met.** "
            f"{_fabric_name} delivers AllReduce in **{_allreduce_sec:.2f} s** "
            f"— within the 1-second target. GPUs remain productive."
        )
    mo.callout(mo.md(_message), kind=_kind)
    return
# ─── ACT II: PREDICTION FEEDBACK ──────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction,
      _allreduce_sec, _bw_bisect, _fabric_cost_m):
    """Targeted feedback for each possible Act II prediction."""
    mo.stop(
        act1_prediction.value is None
        or act1_reflection.value is None
        or act2_prediction.value is None
    )
    _responses = {
        "pred_torus": (
            "**Not quite.** The 3D-torus is the cheapest option, but its bisection "
            "bandwidth scales as O(N^(2/3)). At 1024 GPUs, the torus bisection "
            "bandwidth is roughly 10× lower than a non-blocking fat-tree. "
            "AllReduce on a 1024-GPU torus will take several seconds per iteration — "
            "far exceeding the 1-second target. The cost saving disappears when "
            "GPU utilization drops below 50%."
        ),
        "pred_eth": (
            "**Not quite.** The 2:1 Ethernet fat-tree halves bisection bandwidth "
            "compared to the IB non-blocking option. For large AllReduce tensors, "
            "this 2× reduction in bisection bandwidth doubles AllReduce time. "
            "At 32 GB tensor size with a 1024-GPU cluster, the Ethernet 2:1 fabric "
            "likely violates the 1-second SLA. The $12M saved on fabric is lost "
            "in reduced GPU utilization and extended training time."
        ),
        "pred_ib": (
            f"**Correct.** For an all-to-all AllReduce workload, bisection bandwidth "
            f"is the bottleneck — not per-link speed or switch count. "
            f"The IB NDR fat-tree with 1:1 non-blocking provides the highest bisection "
            f"bandwidth at scale, completing AllReduce in {_allreduce_sec:.2f} s "
            f"on the current configuration. The $20M fabric cost is justified: "
            f"a cluster with 1024 GPUs at $25,000 each represents $25.6M in compute. "
            f"A fabric that wastes 50% of compute time with slow AllReduce "
            f"effectively discards $12M in GPU capacity."
        ),
        "pred_equal": (
            "**Not quite.** All three topologies perform very differently for AllReduce. "
            "Link speed matters only at the local level — bisection bandwidth determines "
            "the cross-cluster communication ceiling for all-to-all patterns. "
            "A fat-tree and a 3D-torus with the same link speed differ by an order of "
            "magnitude in AllReduce time at 1024 GPU scale because their bisection "
            "bandwidths scale differently with N."
        ),
    }
    _picked = act2_prediction.value
    _kind = "success" if _picked == "pred_ib" else "warn"
    mo.callout(mo.md(_responses[_picked]), kind=_kind)
    return
# ─── ACT II: MATHPEEK ACCORDION ───────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction):
    """MathPeek accordion: derivations for AllReduce time and topology scaling."""
    mo.stop(
        act1_prediction.value is None
        or act1_reflection.value is None
        or act2_prediction.value is None
    )
    _derivation = mo.md("""
**Ring-AllReduce time (Rabenseifner's algorithm):**
```
T_allreduce = 2 × (N - 1) / N × M / BW_per_gpu
≈ 2 × M / BW_per_gpu for large N
where BW_per_gpu = BW_bisection / N
```
- **M** — AllReduce message size (gradient tensor, GB)
- **N** — Number of GPUs
- **BW_per_gpu** — Each GPU's share of bisection bandwidth
- Factor **2** — Reduce-scatter phase + all-gather phase
Substituting bisection bandwidth for each topology:
```
Fat-tree (r oversubscription):
BW_bisection = N × link_BW / (2 × r)
T_allreduce ≈ 2 × M × 2 × r / link_BW
= 4 × r × M / link_BW ← independent of N!
3D-torus:
BW_bisection = N^(2/3) × link_BW / 2
BW_per_gpu = N^(2/3) × link_BW / (2 × N) = link_BW / (2 × N^(1/3))
T_allreduce ≈ 4 × M × N^(1/3) / link_BW ← grows as N^(1/3)!
```
**The critical insight:**
For fat-tree, AllReduce time is **independent of cluster size N** (assuming
fixed link_BW and oversubscription). Every GPU added brings its own uplink,
keeping per-GPU bandwidth constant.
For 3D-torus, AllReduce time grows as **N^(1/3)** — at 8,000 GPUs, it is
20× slower than at 1 GPU on the same link speed.
**Cost-performance trade-off:**
| Topology | AllReduce scaling | Cost (1024 GPU) | BW/dollar |
|------------------|-------------------|-----------------|-----------|
| IB NDR fat-tree | O(1) | $20M | high |
| Eth fat-tree 2:1 | O(1) but 2× slower| $8M | medium |
| 3D-torus | O(N^(1/3)) | $5M | low at scale |
The torus is cost-efficient only when clusters are small enough that
N^(1/3) × link_BW still meets the latency target.
""")
    mo.accordion({
        "The governing equations: AllReduce time and topology scaling": _derivation,
    })
    return
# ─── ACT II: STRUCTURED REFLECTION ───────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction):
    """Visual divider shown before the Act II structured reflection."""
    _locked = any(
        _w.value is None
        for _w in (act1_prediction, act1_reflection, act2_prediction)
    )
    mo.stop(_locked)
    mo.md("---")
    return
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction):
    """Act II structured reflection: why the torus underperforms for AllReduce at scale."""
    mo.stop(
        act1_prediction.value is None
        or act1_reflection.value is None
        or act2_prediction.value is None
    )
    _choices = {
        "A) 3D-torus has higher latency than fat-tree because of more hops between non-adjacent nodes": "r2_a",
        "B) Torus bisection bandwidth scales as O(N^(2/3)) vs fat-tree O(N) — all-to-all traffic is bottlenecked by the plane boundaries": "r2_b",
        "C) 3D-torus requires special AllReduce algorithms that are less efficient than ring-allreduce": "r2_c",
        "D) 3D-torus topologies cannot support more than 512 GPUs due to diameter limits": "r2_d",
    }
    act2_reflection = mo.ui.radio(
        label="""**Reflection — Act II.**
Why do 3D-torus topologies perform poorly for AllReduce at large cluster scales?""",
        options=_choices,
    )
    act2_reflection
    return (act2_reflection,)
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction, act2_reflection):
    """Explanation callout for the Act II reflection choice."""
    mo.stop(
        act1_prediction.value is None
        or act1_reflection.value is None
        or act2_prediction.value is None
    )
    mo.stop(act2_reflection.value is None, mo.callout(
        mo.md("Select your answer to see the explanation."), kind="warn"
    ))
    _explanations = {
        "r2_a": (
            "**Not quite.** Latency (hop count) is a separate property from bandwidth. "
            "A 3D-torus and a fat-tree can have similar average path lengths at the same "
            "cluster size. The AllReduce degradation is purely a bandwidth phenomenon: "
            "all-to-all traffic must cross the bisection, and the torus has fewer links "
            "at the bisection plane than the fat-tree. More hops contribute microseconds; "
            "inadequate bisection bandwidth contributes seconds."
        ),
        "r2_b": (
            "**Correct.** The torus bisection bandwidth grows as O(N^(2/3)) because "
            "the bisection cut passes through a cross-sectional plane of the torus — "
            "a 2D slice whose size grows as the square of the linear dimension. "
            "For a cube-shaped 3D-torus of N GPUs, the bisection has N^(2/3) links. "
            "Fat-tree bisection grows as O(N) because every GPU brings its own uplink "
            "to the spine. At 1024 GPUs, the fat-tree has 10× the bisection bandwidth "
            "of the torus per unit of link speed."
        ),
        "r2_c": (
            "**Not quite.** Standard ring-allreduce operates on any topology. "
            "The algorithm itself is topology-agnostic — it works by routing gradients "
            "around a logical ring of GPUs. The 3D-torus does not require special "
            "AllReduce algorithms. The performance difference is purely a function of "
            "available bisection bandwidth, not algorithmic efficiency."
        ),
        "r2_d": (
            "**Not quite.** 3D-torus topologies are used in some large HPC installations "
            "at thousands of nodes (e.g., early Blue Gene systems used 3D/5D torus). "
            "The problem is not a hard capacity limit but a scaling law: AllReduce time "
            "grows as N^(1/3) on a torus. At 4096 GPUs, this produces AllReduce times "
            "that are 4× slower than at 512 GPUs on the same link speed — making the "
            "topology progressively worse for ML training as clusters grow."
        ),
    }
    _answer = act2_reflection.value
    _kind = "success" if _answer == "r2_b" else "warn"
    mo.callout(mo.md(_explanations[_answer]), kind=_kind)
    return
# ─── LEDGER SAVE + HUD ────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(
    mo, ledger, COLORS,
    context_toggle,
    act1_prediction, act1_reflection,
    act2_prediction, act2_reflection,
    act2_fabric, act2_cluster_n,
    _bw_bisect, _allreduce_sec, _sla_violated, _fabric_name, _fabric_cost_m,
):
    """Persist the lab design to the ledger (once both acts finish) and render the HUD footer."""
    _answered = (
        act1_prediction.value is not None,
        act1_reflection.value is not None,
        act2_prediction.value is not None,
        act2_reflection.value is not None,
    )
    # Only save when both acts are complete
    _acts_complete = all(_answered)
    if _acts_complete:
        ledger.save(
            chapter="v2_03",
            design={
                "context": context_toggle.value,
                "fabric_type": act2_fabric.value,
                "cluster_size": act2_cluster_n.value,
                "oversubscription": 2.0 if act2_fabric.value == "eth_fat_tree" else (1.0 if act2_fabric.value == "ib_fat_tree" else 0.0),
                "bisection_bw_gbps": round(_bw_bisect, 1),
                "act1_prediction": act1_prediction.value,
                "act1_correct": act1_prediction.value == "option_b",
                "act2_result": round(_allreduce_sec, 3),
                "act2_decision": act2_fabric.value,
                "constraint_hit": _sla_violated,
            },
        )
    # ── HUD footer ────────────────────────────────────────────────────────────
    _act1_status, _act2_status = _answered[0], _answered[2]
    _act1_correct = act1_prediction.value == "option_b"
    _act2_correct = act2_prediction.value == "pred_ib"
    _hud_items = []
    _hud_items.append(("Lab", "Vol II · Lab 03", "hud-value"))
    _hud_items.append(("Context", context_toggle.value, "hud-value"))
    _hud_items.append((
        "Act I",
        "Correct" if _act1_correct else ("Answered" if _act1_status else "Pending"),
        "hud-active" if _act1_correct else ("hud-value" if _act1_status else "hud-none"),
    ))
    _hud_items.append((
        "Act II",
        "Correct" if _act2_correct else ("Answered" if _act2_status else "Pending"),
        "hud-active" if _act2_correct else ("hud-value" if _act2_status else "hud-none"),
    ))
    _hud_items.append((
        "Bisection BW",
        f"{_bw_bisect:,.0f} GB/s" if _act2_status else "",
        "hud-value",
    ))
    _hud_items.append((
        "AllReduce",
        f"{_allreduce_sec:.2f} s" if _act2_status else "",
        "hud-active" if (_act2_status and not _sla_violated) else ("hud-none" if (_act2_status and _sla_violated) else "hud-value"),
    ))
    _hud_items.append((
        "SLA",
        "Violated" if _sla_violated else ("Met" if _act2_status else ""),
        "hud-none" if _sla_violated else ("hud-active" if _act2_status else "hud-value"),
    ))
    _hud_items.append((
        "Ledger",
        "Saved" if _acts_complete else "Incomplete",
        "hud-active" if _acts_complete else "hud-none",
    ))
    _cell_parts = []
    for _label, _val, _cls in _hud_items:
        _cell_parts.append(
            f"""<div style="display:flex; flex-direction:column; gap:2px;">
<span class="hud-label">{_label}</span>
<span class="{_cls}">{_val}</span>
</div>"""
        )
    _hud_cells = "".join(_cell_parts)
    mo.Html(f"""
<div class="lab-hud" style="margin-top:32px;">
{_hud_cells}
</div>
""")
    return
# Script entry point: launches the marimo app when this file is executed directly.
if __name__ == "__main__":
    app.run()