mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-03-09 07:15:51 -05:00
Add all Vol1 (labs 01-16) and Vol2 (labs 01-17) interactive Marimo labs as the first full first-pass implementation of the ML Systems curriculum labs. Each lab follows the PROTOCOL 2-Act structure (35-40 min): - Act I: Calibration with prediction lock → instruments → overlay - Act II: Design challenge with failure states and reflection Key pedagogical instruments introduced progressively: - Vol1: D·A·M Triad, Iron Law, Memory Ledger, Roofline, Amdahl's Law, Little's Law, P99 Histogram, Compression Frontier, Chouldechova theorem - Vol2: NVLink vs PCIe cliff, Bisection BW, Young-Daly T*, Parallelism Paradox, AllReduce ring vs tree, KV-cache model, Jevons Paradox, DP ε-δ tradeoff, SLO composition, Adversarial Pareto, two-volume synthesis capstone All 35 staged files pass AST syntax verification (36/36 including lab_00). Also includes: - labs/LABS_SPEC.md: authoritative sub-agent brief for all lab conventions - labs/core/style.py: expanded unified design system with semantic color tokens
1561 lines
70 KiB
Python
1561 lines
70 KiB
Python
import marimo
|
||
|
||
# Version string recorded in this notebook file.
__generated_with = "0.19.6"
# The notebook application; every cell below registers itself on it via @app.cell.
app = marimo.App(width="full")
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# LAB 03: THE BISECTION BANDWIDTH WALL
|
||
#
|
||
# Chapter: network_fabrics.qmd (@sec-network-fabrics)
|
||
# Volume II · Lab 03
|
||
#
|
||
# Core Invariant: A non-blocking (1:1) fat-tree topology provides full bisection bandwidth.
|
||
# As clusters scale, bisection bandwidth per GPU decreases unless the fabric
|
||
# is properly overprovisioned. The bisection bandwidth wall is why not all
|
||
# topologies scale equally.
|
||
#
|
||
# Two Acts:
|
||
# Act I — The Bisection Bandwidth Blindspot (12–15 min)
|
||
#   A 128-GPU cluster uses 2:1 oversubscription. AllReduce runs at 40% of expected throughput.
|
||
# Why? Bisection bandwidth halved by oversubscription explains the drop.
|
||
#
|
||
# Act II — The 1024-GPU Fabric Design (20–25 min)
|
||
# $50M budget, 3 fabric options: IB fat-tree, Eth fat-tree, 3D-torus.
|
||
# AllReduce at scale is dominated by bisection bandwidth, not per-link speed.
|
||
# Failure state: bisection BW insufficient → AllReduce > 1-second target.
|
||
#
|
||
# Contexts: 8-GPU cluster (single-node reference) vs 1024-GPU cluster (scale)
|
||
#
|
||
# Design Ledger: chapter="v2_03", context, fabric_type, cluster_size,
|
||
# oversubscription, bisection_bw_gbps, act1_prediction,
|
||
# act1_correct, act2_result, act2_decision, constraint_hit
|
||
#
|
||
# Hardware constants (source in comments on each constant):
|
||
# IB_HDR200_PORT_GBPS = 200 # InfiniBand HDR200 single port, Gb/s
|
||
# IB_NDR400_PORT_GBPS = 400 # InfiniBand NDR400 single port, Gb/s
|
||
# ETH_100G_PORT_GBPS = 100 # 100GbE port line rate, Gb/s
|
||
# ETH_400G_PORT_GBPS = 400 # 400GbE port line rate, Gb/s
|
||
# IB_HDR200_EFF_GBS = 22.5 # InfiniBand HDR200 effective unidirectional GB/s
|
||
# IB_NDR400_EFF_GBS = 45.0 # InfiniBand NDR400 effective unidirectional GB/s
|
||
# ETH_100G_EFF_GBS = 11.0 # 100GbE effective GB/s after TCP/IP overhead
|
||
# ETH_400G_EFF_GBS = 44.0 # 400GbE effective GB/s after overhead
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
# ─── CELL 0: SETUP (hide_code=False — leave visible for instructor inspection) ─
|
||
|
||
|
||
@app.cell
def _():
    # Setup cell: library imports, import-path bootstrap for the `labs`
    # package, shared hardware/cost constants, and the DesignLedger instance.
    import math
    import sys
    from pathlib import Path

    import marimo as mo
    import numpy as np
    import plotly.graph_objects as go

    # parents[2] is two directory levels above this file's own folder; it is
    # put first on sys.path so the `labs.*` imports below resolve.
    _repo_root = str(Path(__file__).resolve().parents[2])
    if _repo_root not in sys.path:
        sys.path.insert(0, _repo_root)

    from labs.core.state import DesignLedger
    from labs.core.style import COLORS, LAB_CSS, apply_plotly_theme

    # ── Hardware constants (source: vendor datasheets + @sec-network-fabrics) ──

    # InfiniBand per-port line rates, Gb/s (NVIDIA/Mellanox datasheet).
    IB_HDR200_PORT_GBPS = 200  # InfiniBand HDR (HDR200)
    IB_NDR400_PORT_GBPS = 400  # InfiniBand NDR (NDR400)

    # Ethernet per-port line rates, Gb/s (IEEE 802.3 standards).
    ETH_100G_PORT_GBPS = 100  # 100GbE
    ETH_400G_PORT_GBPS = 400  # 400GbE

    # Effective unidirectional bandwidth in GB/s: line rate in Gb/s times a
    # protocol-efficiency factor, divided by 8 bits per byte.
    # InfiniBand uses ~90% efficiency (source: MLPerf network BW).
    IB_HDR200_EFF_GBS = 22.5  # 200 Gbps × 0.9 / 8
    IB_NDR400_EFF_GBS = 45.0  # 400 Gbps × 0.9 / 8

    # Ethernet uses ~88% efficiency after TCP/UDP overhead.
    ETH_100G_EFF_GBS = 11.0  # 100 Gbps × 0.88 / 8
    ETH_400G_EFF_GBS = 44.0  # 400 Gbps × 0.88 / 8

    # Fabric cost estimates in $M for the 1024-GPU design in Act II
    # (source: industry white papers). Order-of-magnitude figures for
    # pedagogical framing only.
    IB_NDR_FABRIC_COST_M = 20.0  # IB NDR fat-tree, non-blocking
    ETH_400G_FABRIC_COST_M = 8.0  # 400GbE fat-tree, 2:1 oversubscribed
    TORUS_3D_FABRIC_COST_M = 5.0  # 3D-torus (fixed links)

    # Ring-AllReduce timing model referenced by this lab:
    #   T = 2 * (N-1)/N * message_size / bisection_bw
    # which approaches 2 * message_size / bisection_bw for large N.
    # Source: @sec-network-fabrics-allreduce-algorithms (Rabenseifner's algorithm)

    ledger = DesignLedger()

    return (
        mo, ledger, COLORS, LAB_CSS, apply_plotly_theme,
        go, np, math,
        IB_HDR200_PORT_GBPS, IB_NDR400_PORT_GBPS,
        ETH_100G_PORT_GBPS, ETH_400G_PORT_GBPS,
        IB_HDR200_EFF_GBS, IB_NDR400_EFF_GBS,
        ETH_100G_EFF_GBS, ETH_400G_EFF_GBS,
        IB_NDR_FABRIC_COST_M, ETH_400G_FABRIC_COST_M, TORUS_3D_FABRIC_COST_M,
    )
|
||
|
||
|
||
# ─── CELL 1: HEADER ───────────────────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, LAB_CSS, COLORS):
    # Title banner: lab CSS followed by the hero card (title, headline
    # formula, summary paragraph, and topic badges).
    _hero_html = """
    <div style="background: linear-gradient(135deg, #0f172a 0%, #1e293b 60%, #0c1a2e 100%);
                padding: 36px 44px; border-radius: 16px; color: white;
                box-shadow: 0 8px 32px rgba(0,0,0,0.35);">
        <div style="font-size: 0.72rem; font-weight: 700; letter-spacing: 0.18em;
                    color: #475569; text-transform: uppercase; margin-bottom: 10px;">
            Machine Learning Systems · Volume II · Lab 03
        </div>
        <h1 style="margin: 0 0 10px 0; font-size: 2.4rem; font-weight: 900;
                   color: #f8fafc; line-height: 1.1; letter-spacing: -0.02em;">
            The Bisection Bandwidth Wall
        </h1>
        <p style="margin: 0 0 6px 0; font-size: 1.05rem; font-weight: 600;
                  color: #94a3b8; letter-spacing: 0.04em; font-family: 'SF Mono', monospace;">
            BW_bisection = N × link_BW / (2 × oversubscription_ratio)
        </p>
        <p style="margin: 0 0 22px 0; font-size: 1.0rem; color: #64748b;
                  max-width: 700px; line-height: 1.65;">
            Fat-tree topology promises full bisection bandwidth — every GPU reaches
            every other GPU at full link speed. That promise holds only when the fabric
            is non-blocking. Oversubscription quietly halves it. At 1024 GPUs, the choice
            of topology determines whether AllReduce finishes in 200 ms or 4 seconds.
        </p>
        <div style="display: flex; gap: 12px; flex-wrap: wrap; margin-bottom: 20px;">
            <span style="background: rgba(99,102,241,0.18); color: #a5b4fc;
                         padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
                         font-weight: 600; border: 1px solid rgba(99,102,241,0.3);">
                2 Acts · 35–40 min
            </span>
            <span style="background: rgba(203,32,45,0.15); color: #fca5a5;
                         padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
                         font-weight: 600; border: 1px solid rgba(203,32,45,0.25);">
                Chapter: @sec-network-fabrics
            </span>
            <span style="background: rgba(0,143,69,0.15); color: #6ee7b7;
                         padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
                         font-weight: 600; border: 1px solid rgba(0,143,69,0.25);">
                Instrument: Bisection BW Calculator
            </span>
        </div>
        <div style="display: flex; gap: 10px; flex-wrap: wrap;">
            <span class="badge badge-info">Bisection Bandwidth</span>
            <span class="badge badge-warn">Oversubscription</span>
            <span class="badge badge-fail">AllReduce Bottleneck</span>
            <span class="badge badge-ok">Fat-Tree Non-Blocking</span>
        </div>
    </div>
    """
    mo.vstack([LAB_CSS, mo.Html(_hero_html)])
    return
|
||
|
||
|
||
# ─── CELL 2: RECOMMENDED READING ──────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo):
    # Info callout listing the chapter sections to read before the lab.
    _reading_list = mo.md("""
    **Recommended Reading** — Complete the following before this lab:

    - **@sec-network-fabrics-bisection-bandwidth** (Bisection Bandwidth) — The minimum-cut
      definition, why it sets the all-to-all communication ceiling, and the fat-tree formula.
    - **@sec-network-fabrics-fat-tree-topology** (Fat-Tree Structure) — k-ary fat-tree
      construction, oversubscription ratios, and the non-blocking guarantee.
    - **@sec-network-fabrics-allreduce** (AllReduce Algorithms) — Ring-AllReduce bandwidth
      analysis and why bisection bandwidth is the bottleneck for large clusters.
    - **@sec-network-fabrics-topology-comparison** (Topology Trade-offs) — Fat-tree vs
      3D-torus bisection bandwidth scaling: O(N) vs O(N^(2/3)).
    """)
    mo.callout(_reading_list, kind="info")
    return
|
||
|
||
|
||
# ─── CELL 3: CONTEXT TOGGLE ───────────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo):
    # Radio selector for the cluster context; the 8-GPU option is preselected.
    _small_label = "8-GPU Cluster (single-node reference scale)"
    _large_label = "1024-GPU Cluster (production training scale)"
    context_toggle = mo.ui.radio(
        options={_small_label: "small", _large_label: "large"},
        value=_small_label,
        label="Cluster context (sets scale for all computations):",
        inline=True,
    )
    context_toggle
    return (context_toggle,)
|
||
|
||
|
||
# ═════════════════════════════════════════════════════════════════════════════
|
||
# ACT I — THE BISECTION BANDWIDTH BLINDSPOT
|
||
# ═════════════════════════════════════════════════════════════════════════════
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo):
    # Act I section banner: numbered badge, time estimate, title, subtitle.
    _act1_banner = """<div style="margin: 24px 0 8px 0;">
        <div style="display:flex; align-items:center; gap:12px;">
            <div style="background:#006395; color:white; border-radius:50%;
                        width:28px; height:28px; display:inline-flex; align-items:center;
                        justify-content:center; font-size:0.85rem; font-weight:800;
                        flex-shrink:0;">I</div>
            <div style="flex:1; height:2px; background:#e2e8f0;"></div>
            <div style="font-size:0.72rem; font-weight:700; color:#94a3b8;
                        text-transform:uppercase; letter-spacing:0.12em;">
                Act I · 12–15 min
            </div>
        </div>
        <div style="font-size:1.6rem; font-weight:800; color:#0f172a;
                    margin-top:8px; line-height:1.2;">
            The Bisection Bandwidth Blindspot
        </div>
        <div style="color:#475569; font-size:0.95rem; margin-top:4px;">
            Why is AllReduce running at 40% of expected throughput on our new cluster?
        </div>
    </div>"""
    mo.vstack([mo.Html(_act1_banner)])
    return
|
||
|
||
|
||
# ─── ACT I: STAKEHOLDER MESSAGE ───────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, COLORS):
    # Act I scenario: the network architect's message, rendered as a card
    # tinted with the blue palette entries from COLORS.
    _accent, _tint = COLORS["BlueLine"], COLORS["BlueL"]
    _message_html = f"""
    <div style="border-left:4px solid {_accent}; background:{_tint};
                border-radius:0 10px 10px 0; padding:16px 22px; margin:12px 0;">
        <div style="font-size:0.72rem; font-weight:700; color:{_accent};
                    text-transform:uppercase; letter-spacing:0.1em; margin-bottom:6px;">
            Incoming Message · Network Architect, Meridian AI Infrastructure
        </div>
        <div style="font-style:italic; font-size:1.0rem; color:#1e293b; line-height:1.65;">
            "We commissioned a new 128-GPU training cluster. To reduce fabric cost we used
            a 2:1 oversubscription on the spine switches — half the uplinks of a full fat-tree.
            The vendor assured us this is standard for most workloads. But our distributed
            training AllReduce is running at 40% of the throughput we calculated. The GPUs are
            idle 60% of the time waiting on gradients. What did we get wrong?"
        </div>
        <div style="font-size:0.78rem; color:#475569; margin-top:8px; font-weight:600;">
            — Kenji Watanabe, Network Architect · Meridian AI Infrastructure (128-GPU cluster)
        </div>
    </div>
    """
    mo.Html(_message_html)
    return
|
||
|
||
|
||
# ─── ACT I: CONCEPT SETUP ─────────────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo):
    # Concept primer: bisection-bandwidth definition and formulas, followed by
    # a callout relating 2:1 oversubscription to the 40% observation.
    _primer = mo.md("""
    ## Bisection Bandwidth is the All-to-All Ceiling

    **Bisection bandwidth** is the minimum bandwidth when you cut a network into two
    equal halves. It is the theoretical ceiling for any all-to-all communication pattern
    — the kind AllReduce uses when every GPU must exchange gradients with every other GPU.

    For a **fat-tree with no oversubscription** (non-blocking), the bisection bandwidth is:

    ```
    BW_bisection = N × link_BW / 2
    ```

    Where `N` is the number of GPUs and `link_BW` is the bandwidth of each GPU uplink.
    Every GPU can communicate at full link speed simultaneously — "no blocking" means
    no link is ever the bottleneck.

    When **oversubscription ratio `r`** is applied (e.g., 2:1 means half the spine
    uplinks are removed), bisection bandwidth is cut by the same factor:

    ```
    BW_bisection (oversubscribed) = N × link_BW / (2 × r)
    ```

    A 2:1 oversubscription **halves bisection bandwidth**. AllReduce throughput for
    an all-to-all workload is bounded by bisection bandwidth, so performance is also
    halved — not slightly degraded.
    """)
    _observation_note = mo.callout(
        mo.md(
            "**The 40% observation.** A 2:1 oversubscription halves bisection bandwidth. "
            "If AllReduce efficiency was ~80% of theoretical on a non-blocking fabric, "
            "it becomes ~40% on the oversubscribed fabric — exactly the observation above. "
            "The GPU idleness is not a software bug. It is a physics constraint."
        ),
        kind="info",
    )
    mo.vstack([_primer, _observation_note])
    return
|
||
|
||
|
||
# ─── ACT I: PREDICTION LOCK ───────────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo):
    # Act I prediction lock: four candidate explanations, none preselected.
    # Label continuation lines stay at column 0 so markdown does not treat
    # them as an indented code block.
    _choices = {
        "A) 40% of expected is close enough — oversubscription rarely matters for ML workloads": "option_a",
        "B) The 2:1 oversubscription halves bisection bandwidth — all-to-all patterns see 50% of expected throughput": "option_b",
        "C) The issue is latency, not bandwidth — too many hops in the fat-tree add queueing delay": "option_c",
        "D) 128 GPUs exceeds the practical scaling limit for fat-tree topologies": "option_d",
    }
    _prompt = """**Prediction Lock — Act I.**
A 128-GPU cluster uses InfiniBand HDR200 (200 Gbps per port) with a 2:1 oversubscription
ratio at the spine layer. A non-blocking fat-tree of the same cluster would have
bisection bandwidth = 128 × 25 GB/s / 2 = 1,600 GB/s total.

The AllReduce throughput is 40% of expected. Which explanation best accounts for this?"""
    act1_prediction = mo.ui.radio(options=_choices, label=_prompt)
    act1_prediction
    return (act1_prediction,)
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction):
    # Gate: until a prediction is chosen, halt here and show an unlock hint.
    _unlock_hint = mo.callout(
        mo.md("Select your prediction above to unlock the Bisection Bandwidth instruments."),
        kind="warn",
    )
    _nothing_selected = act1_prediction.value is None
    mo.stop(_nothing_selected, _unlock_hint)
    return
|
||
|
||
|
||
# ─── ACT I: INSTRUMENTS ───────────────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction):
    # Act I controls: cluster size, link type, and oversubscription ratio.
    # Nothing is built until a prediction has been made.
    mo.stop(act1_prediction.value is None)

    # Display label → key identifying the link type.
    _link_choices = {
        "IB HDR200 — 200 Gbps (22.5 GB/s eff)": "ib_hdr200",
        "IB NDR400 — 400 Gbps (45.0 GB/s eff)": "ib_ndr400",
        "100GbE — 100 Gbps (11.0 GB/s eff)": "eth_100g",
        "400GbE — 400 Gbps (44.0 GB/s eff)": "eth_400g",
    }
    # Display label → numeric oversubscription ratio r.
    _oversub_choices = {
        "1:1 — Non-blocking (full fat-tree)": 1.0,
        "2:1 — Half spine uplinks": 2.0,
        "4:1 — Quarter spine uplinks": 4.0,
    }

    act1_cluster_size = mo.ui.slider(
        start=8,
        stop=512,
        value=128,
        step=8,
        label="Cluster size N (GPUs)",
    )
    act1_link_bw = mo.ui.dropdown(
        options=_link_choices,
        value="IB HDR200 — 200 Gbps (22.5 GB/s eff)",
        label="Link type (per GPU uplink)",
    )
    act1_oversub = mo.ui.dropdown(
        options=_oversub_choices,
        value="1:1 — Non-blocking (full fat-tree)",
        label="Oversubscription ratio",
    )

    _top_row = mo.hstack([act1_cluster_size, act1_link_bw], justify="start", gap="2rem")
    mo.vstack([
        mo.md("### Bisection Bandwidth Calculator"),
        _top_row,
        act1_oversub,
    ])
    return (act1_cluster_size, act1_link_bw, act1_oversub)
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(
    mo, act1_prediction, act1_cluster_size, act1_link_bw, act1_oversub,
    go, apply_plotly_theme, COLORS,
    IB_HDR200_EFF_GBS, IB_NDR400_EFF_GBS, ETH_100G_EFF_GBS, ETH_400G_EFF_GBS,
):
    # Act I calculator. Reads the three controls, evaluates
    #   BW_bisection = N × link_BW / (2 × r)
    # and renders four metric cards, the worked formula, and a bar chart that
    # compares the 1:1 / 2:1 / 4:1 ratios with the selected one highlighted.
    mo.stop(act1_prediction.value is None)

    # ── Link bandwidth lookup: key → (short label, effective GB/s) ────────────
    _links = {
        "ib_hdr200": ("IB HDR200", IB_HDR200_EFF_GBS),
        "ib_ndr400": ("IB NDR400", IB_NDR400_EFF_GBS),
        "eth_100g": ("100GbE", ETH_100G_EFF_GBS),
        "eth_400g": ("400GbE", ETH_400G_EFF_GBS),
    }
    _link_label, _link_bw_gbs = _links[act1_link_bw.value]
    _N = act1_cluster_size.value
    _r = act1_oversub.value

    # ── Bisection bandwidth formula ───────────────────────────────────────────
    # Source: @sec-network-fabrics-bisection-bandwidth
    _bw_full = _N * _link_bw_gbs / 2.0             # GB/s, non-blocking
    _bw_oversub = _N * _link_bw_gbs / (2.0 * _r)   # GB/s, with oversubscription
    _bw_per_gpu = _bw_oversub / _N                 # GB/s per GPU (bisection share)

    # ── AllReduce bandwidth efficiency ────────────────────────────────────────
    # Reported relative to the non-blocking fabric, so it equals 100 / r.
    _efficiency_pct = (_bw_oversub / _bw_full) * 100.0 if _bw_full > 0 else 0.0

    # One colour for both the card border and the value text.
    if _efficiency_pct >= 90:
        _eff_color = "#008F45"
    elif _efficiency_pct >= 60:
        _eff_color = "#CC5500"
    else:
        _eff_color = "#CB202D"

    # ── Bottleneck classification ─────────────────────────────────────────────
    if _r <= 1.0:
        _fabric_status = "Non-blocking"
        _status_color = COLORS["GreenLine"]
        _status_bg = COLORS["GreenLL"]
    elif _r <= 2.0:
        _fabric_status = "2:1 Oversubscribed"
        _status_color = COLORS["OrangeLine"]
        _status_bg = COLORS["OrangeLL"]
    else:
        _fabric_status = "4:1 Oversubscribed"
        _status_color = COLORS["RedLine"]
        _status_bg = COLORS["RedLL"]

    # ── Comparison bar chart: bandwidth at each oversubscription ratio ────────
    _oversub_labels = ["1:1 (Non-blocking)", "2:1 Oversubscribed", "4:1 Oversubscribed"]
    _oversub_ratios = [1.0, 2.0, 4.0]
    _bw_values = [_N * _link_bw_gbs / (2.0 * r) for r in _oversub_ratios]
    _bar_colors = [COLORS["GreenLine"], COLORS["OrangeLine"], COLORS["RedLine"]]
    _selected_idx = _oversub_ratios.index(_r)
    _is_selected = [i == _selected_idx for i in range(len(_bar_colors))]

    _fig = go.Figure()

    _fig.add_trace(go.Bar(
        x=_oversub_labels,
        y=_bw_values,
        # Selected bar is filled dark; the others keep their status colour.
        marker_color=[
            "#1e293b" if sel else color
            for sel, color in zip(_is_selected, _bar_colors)
        ],
        marker_line_color=list(_bar_colors),
        marker_line_width=[3 if sel else 1 for sel in _is_selected],
        # Per-bar opacity must go on marker.opacity: the trace-level `opacity`
        # property only accepts a single number and rejects a list.
        marker_opacity=[1.0 if sel else 0.55 for sel in _is_selected],
        text=[f"{v:,.0f} GB/s" for v in _bw_values],
        textposition="outside",
        textfont=dict(size=11, family="SF Mono, monospace"),
        width=0.55,
    ))

    # Star marker on top of the selected configuration.
    _fig.add_trace(go.Scatter(
        x=[_oversub_labels[_selected_idx]],
        y=[_bw_values[_selected_idx]],
        mode="markers",
        marker=dict(
            symbol="star",
            size=14,
            color=_bar_colors[_selected_idx],
            line=dict(color="white", width=1),
        ),
        name="Current config",
        showlegend=False,
    ))

    _fig.update_layout(
        title=dict(
            text=f"Bisection Bandwidth by Oversubscription — {_N}-GPU cluster, {_link_label}",
            font=dict(size=13, color="#1e293b"),
            x=0,
        ),
        height=320,
        yaxis=dict(title="Total Bisection Bandwidth (GB/s)", gridcolor="#f1f5f9"),
        xaxis=dict(title="Oversubscription Ratio"),
        showlegend=False,
        margin=dict(l=60, r=20, t=50, b=40),
    )
    apply_plotly_theme(_fig)

    # ── Metric cards ──────────────────────────────────────────────────────────
    _cards_html = f"""
    <div style="display:flex; gap:16px; flex-wrap:wrap; margin:16px 0;">
        <div style="padding:18px; border:1px solid #e2e8f0; border-radius:10px;
                    min-width:175px; text-align:center; background:white;
                    border-top:3px solid {_status_color};">
            <div style="color:#94a3b8; font-size:0.78rem; font-weight:600; margin-bottom:4px;">
                Total Bisection BW
            </div>
            <div style="font-size:1.35rem; font-weight:800; color:{_status_color};
                        font-family:'SF Mono',monospace;">
                {_bw_oversub:,.0f} GB/s
            </div>
            <div style="font-size:0.72rem; color:#94a3b8; margin-top:4px;">
                {_fabric_status}
            </div>
        </div>
        <div style="padding:18px; border:1px solid #e2e8f0; border-radius:10px;
                    min-width:175px; text-align:center; background:white;
                    border-top:3px solid {COLORS['BlueLine']};">
            <div style="color:#94a3b8; font-size:0.78rem; font-weight:600; margin-bottom:4px;">
                Per-GPU BW Share
            </div>
            <div style="font-size:1.35rem; font-weight:800; color:{COLORS['BlueLine']};
                        font-family:'SF Mono',monospace;">
                {_bw_per_gpu:.2f} GB/s
            </div>
            <div style="font-size:0.72rem; color:#94a3b8; margin-top:4px;">
                of {_link_bw_gbs:.1f} GB/s link speed
            </div>
        </div>
        <div style="padding:18px; border:1px solid #e2e8f0; border-radius:10px;
                    min-width:175px; text-align:center; background:white;
                    border-top:3px solid {_eff_color};">
            <div style="color:#94a3b8; font-size:0.78rem; font-weight:600; margin-bottom:4px;">
                AllReduce Efficiency
            </div>
            <div style="font-size:1.35rem; font-weight:800;
                        color:{_eff_color};
                        font-family:'SF Mono',monospace;">
                {_efficiency_pct:.0f}%
            </div>
            <div style="font-size:0.72rem; color:#94a3b8; margin-top:4px;">
                vs non-blocking fabric
            </div>
        </div>
        <div style="padding:18px; border:1px solid #e2e8f0; border-radius:10px;
                    min-width:175px; text-align:center; background:white;
                    border-top:3px solid {COLORS['BlueLine']};">
            <div style="color:#94a3b8; font-size:0.78rem; font-weight:600; margin-bottom:4px;">
                Non-blocking Peak
            </div>
            <div style="font-size:1.35rem; font-weight:800; color:{COLORS['TextSec']};
                        font-family:'SF Mono',monospace;">
                {_bw_full:,.0f} GB/s
            </div>
            <div style="font-size:0.72rem; color:#94a3b8; margin-top:4px;">
                1:1 fat-tree reference
            </div>
        </div>
    </div>
    """

    # ── Physics formula display ───────────────────────────────────────────────
    _formula_text = f"""
    **Bisection Bandwidth — Live Calculation** (N={_N} GPUs, link={_link_bw_gbs:.1f} GB/s, r={_r:.0f}:1)

    ```
    BW_bisection = N × link_BW / (2 × r)
                 = {_N} × {_link_bw_gbs:.1f} GB/s / (2 × {_r:.0f})
                 = {_bw_oversub:,.1f} GB/s ← {_fabric_status}

    Non-blocking = {_N} × {_link_bw_gbs:.1f} GB/s / 2
                 = {_bw_full:,.1f} GB/s ← full fat-tree

    AllReduce efficiency = {_bw_oversub:,.1f} / {_bw_full:,.1f}
                         = {_efficiency_pct:.1f}% ← degradation from oversubscription
    ```
    """

    mo.vstack([
        mo.Html(_cards_html),
        mo.md(_formula_text),
        mo.ui.plotly(_fig),
    ])
    # NOTE(review): these names are underscore-prefixed, which marimo treats as
    # cell-local — confirm whether any other cell is expected to consume them.
    return (
        _bw_oversub, _bw_full, _efficiency_pct,
        _N, _r, _link_bw_gbs, _link_label,
    )
|
||
|
||
|
||
# ─── ACT I: PREDICTION VS REALITY OVERLAY ─────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(
    mo, act1_prediction, act1_cluster_size, act1_link_bw, act1_oversub,
    IB_HDR200_EFF_GBS, IB_NDR400_EFF_GBS, ETH_100G_EFF_GBS, ETH_400G_EFF_GBS,
):
    # Prediction-vs-reality overlay: compares the efficiency implied by the
    # student's Act I choice with the value computed from the current controls.
    #
    # The bandwidth figures are derived here from the public UI elements and
    # constants. Underscore-prefixed names are cell-local in marimo, so this
    # cell cannot receive `_bw_oversub`, `_bw_full`, `_efficiency_pct` or `_r`
    # from another cell; referencing them undefined would raise NameError.
    mo.stop(act1_prediction.value is None)

    _link_bw_gbs = {
        "ib_hdr200": IB_HDR200_EFF_GBS,
        "ib_ndr400": IB_NDR400_EFF_GBS,
        "eth_100g": ETH_100G_EFF_GBS,
        "eth_400g": ETH_400G_EFF_GBS,
    }[act1_link_bw.value]
    _N = act1_cluster_size.value
    _r = act1_oversub.value

    # BW_bisection = N × link_BW / (2 × r); r = 1 gives the non-blocking peak.
    _bw_full = _N * _link_bw_gbs / 2.0
    _bw_oversub = _N * _link_bw_gbs / (2.0 * _r)
    _efficiency_pct = (_bw_oversub / _bw_full) * 100.0 if _bw_full > 0 else 0.0

    # Efficiency (%) that each answer option is taken to imply.
    _predicted_pct = {
        "option_a": 90.0,  # oversubscription "rarely matters"
        "option_b": 50.0,  # 2:1 halves bisection bandwidth — the intended answer
        "option_c": 75.0,  # latency (hops) rather than bandwidth reduction
        "option_d": 20.0,  # capacity limit rather than oversubscription
    }[act1_prediction.value]

    _actual_pct = _efficiency_pct
    _gap = abs(_actual_pct - _predicted_pct)
    _is_correct = act1_prediction.value == "option_b"

    if _is_correct:
        _overlay = mo.callout(mo.md(
            f"**Correct.** Your prediction of {_predicted_pct:.0f}% matches the physics. "
            f"The actual AllReduce efficiency for {_r:.0f}:1 oversubscription is "
            f"**{_actual_pct:.0f}%** of a non-blocking fabric — "
            f"bisection bandwidth is cut by exactly the oversubscription ratio. "
            f"There is no partial degradation: the halved spine capacity directly halves "
            f"the cross-bisection bandwidth available for AllReduce."
        ), kind="success")
    else:
        _overlay = mo.callout(mo.md(
            f"**Not quite.** You predicted {_predicted_pct:.0f}% efficiency. "
            f"The actual value for {_r:.0f}:1 oversubscription is **{_actual_pct:.0f}%** "
            f"— off by {_gap:.0f} percentage points. "
            f"The correct answer is B: the 2:1 oversubscription **halves** bisection "
            f"bandwidth (from {_bw_full:,.0f} GB/s to {_bw_oversub:,.0f} GB/s). "
            f"AllReduce is an all-to-all pattern — its throughput is bounded by bisection "
            f"bandwidth, so the degradation is proportional, not incidental."
        ), kind="warn")

    _overlay
    return
|
||
|
||
|
||
# ─── ACT I: MATHPEEK ACCORDION ────────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction):
    # MathPeek: a single-section accordion holding the governing equations.
    # Shown only after a prediction has been made.
    mo.stop(act1_prediction.value is None)

    _equations = mo.md("""
    **Bisection Bandwidth Formula (fat-tree):**

    ```
    BW_bisection = N × link_BW / (2 × r)
    ```

    - **N** — Number of end-hosts (GPUs) in the cluster
    - **link_BW** — Unidirectional bandwidth of each GPU uplink (GB/s)
    - **r** — Oversubscription ratio (1 = non-blocking, 2 = half spine, 4 = quarter spine)
    - **2** — Factor of 2 because bisection cuts the network in half;
      each half can send at most half the total uplink bandwidth

    **Non-blocking condition (r = 1):**

    ```
    BW_bisection_max = N × link_BW / 2
    ```

    A k-ary fat-tree has k/2 spine switches, each with k ports.
    Full non-blocking requires as many uplink ports at each layer as downlink ports.
    Oversubscription reduces spine uplinks, creating the bottleneck at the bisection cut.

    **AllReduce time (ring-allreduce, large N):**

    ```
    T_allreduce ≈ 2 × M / (BW_bisection / N)
                = 2 × N × M / BW_bisection
    ```

    - **M** — Message size (gradient tensor, GB)
    - The "2" comes from the two phases of ring-allreduce (reduce-scatter + all-gather)

    **Scaling behavior by topology:**

    | Topology | Bisection BW scales as | Notes |
    |----------------|------------------------|------------------------------|
    | Full fat-tree | O(N) | Linear — ideal for AllReduce |
    | 2:1 fat-tree | O(N/2) | Linear, but halved |
    | 3D-torus | O(N^(2/3)) | Sub-linear — poor at scale |
    | Ring | O(1) per node | Fixed 2 links per GPU |
    """)
    _section_title = "The governing equations: Fat-tree bisection bandwidth"
    mo.accordion({_section_title: _equations})
    return
|
||
|
||
|
||
# ─── ACT I: STRUCTURED REFLECTION ─────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction):
    # Horizontal rule ahead of the Act I reflection; hidden until a
    # prediction exists.
    mo.stop(act1_prediction.value is None)

    _divider = mo.md("---")
    _divider
    return
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction):
    # Act I reflection question: four options, none preselected.
    mo.stop(act1_prediction.value is None)

    _answers = {
        "A) All switches have equal port count — k ports in, k ports out at every layer": "ref_a",
        "B) Every leaf-to-leaf path has the same bandwidth as a direct link — no bandwidth bottleneck at any layer": "ref_b",
        "C) The tree has exactly 3 layers: edge, aggregation, core": "ref_c",
        "D) Maximum path length is log₂(N) hops, minimizing queueing latency": "ref_d",
    }
    _question = """**Reflection — Act I.**
What is the defining property of a non-blocking fat-tree topology?"""
    act1_reflection = mo.ui.radio(options=_answers, label=_question)
    act1_reflection
    return (act1_reflection,)
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection):
    # Feedback for the Act I reflection: one explanation per option, shown as
    # a success callout for "ref_b" and a warning callout otherwise.
    mo.stop(act1_prediction.value is None)
    _answer_prompt = mo.callout(
        mo.md("Select your answer to see the explanation."), kind="warn"
    )
    mo.stop(act1_reflection.value is None, _answer_prompt)

    _explanations = {
        "ref_a": (
            "**Not quite.** Equal port counts is a property of symmetric switches "
            "but does not guarantee non-blocking behavior. A switch can have equal "
            "ingress and egress ports and still create bandwidth contention if "
            "aggregation-layer uplinks are fewer than downlinks — which is exactly "
            "what oversubscription does."
        ),
        "ref_b": (
            "**Correct.** A non-blocking fat-tree guarantees that any leaf-to-leaf "
            "path can sustain full link bandwidth simultaneously with all other "
            "leaf-to-leaf paths. This requires that at every layer, the total "
            "uplink capacity equals the total downlink capacity. When this holds, "
            "no switch layer is ever a bandwidth bottleneck — bisection bandwidth "
            "equals N × link_BW / 2, the theoretical maximum."
        ),
        "ref_c": (
            "**Not quite.** Fat-trees are commonly depicted with 3 layers "
            "(edge / aggregation / core), but this is a specific implementation "
            "choice, not the defining property. A fat-tree can have more layers. "
            "The defining property is the bandwidth guarantee at every cut, "
            "not the number of layers."
        ),
        "ref_d": (
            "**Not quite.** O(log N) path length is a property of fat-trees "
            "but is not what makes them non-blocking. Latency and bandwidth are "
            "independent properties. A topology can have short paths and still "
            "be bandwidth-bottlenecked at the bisection if uplink capacity is "
            "insufficient."
        ),
    }

    _chosen = act1_reflection.value
    if _chosen == "ref_b":
        _callout_kind = "success"
    else:
        _callout_kind = "warn"

    mo.callout(mo.md(_explanations[_chosen]), kind=_callout_kind)
    return
|
||
|
||
|
||
# ═════════════════════════════════════════════════════════════════════════════
|
||
# ACT II — THE 1024-GPU FABRIC DESIGN
|
||
# ═════════════════════════════════════════════════════════════════════════════
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection):
    # Act II section banner; shown only once both the Act I prediction and
    # the Act I reflection have been answered.
    mo.stop(act1_prediction.value is None)
    mo.stop(act1_reflection.value is None)

    _act2_banner = """<div style="margin: 24px 0 8px 0;">
        <div style="display:flex; align-items:center; gap:12px;">
            <div style="background:#CB202D; color:white; border-radius:50%;
                        width:28px; height:28px; display:inline-flex; align-items:center;
                        justify-content:center; font-size:0.85rem; font-weight:800;
                        flex-shrink:0;">II</div>
            <div style="flex:1; height:2px; background:#e2e8f0;"></div>
            <div style="font-size:0.72rem; font-weight:700; color:#94a3b8;
                        text-transform:uppercase; letter-spacing:0.12em;">
                Act II · 20–25 min
            </div>
        </div>
        <div style="font-size:1.6rem; font-weight:800; color:#0f172a;
                    margin-top:8px; line-height:1.2;">
            The 1024-GPU Fabric Design
        </div>
        <div style="color:#475569; font-size:0.95rem; margin-top:4px;">
            $50M hardware budget. Three fabric options. One dominant AllReduce workload.
        </div>
    </div>"""
    mo.vstack([mo.Html(_act2_banner)])
    return
|
||
|
||
|
||
# ─── ACT II: STAKEHOLDER MESSAGE ──────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, COLORS):
    # Stakeholder brief for Act II, shown only after Act I is answered.
    mo.stop(act1_prediction.value is None or act1_reflection.value is None)

    # Accent (border + heading) and background tint come from the shared palette.
    _accent = COLORS["OrangeLine"]
    _tint = COLORS["OrangeL"]

    _vp_message_html = f"""
    <div style="border-left:4px solid {_accent}; background:{_tint};
                border-radius:0 10px 10px 0; padding:16px 22px; margin:12px 0;">
        <div style="font-size:0.72rem; font-weight:700; color:{_accent};
                    text-transform:uppercase; letter-spacing:0.1em; margin-bottom:6px;">
            Incoming Message · Infrastructure VP, Apex Foundation Models
        </div>
        <div style="font-style:italic; font-size:1.0rem; color:#1e293b; line-height:1.65;">
            "We are building a 1024-GPU cluster for large language model pre-training.
            Total hardware budget: $50M, with $20–25M allocated to compute (GPUs).
            I have three fabric proposals on my desk:
            (A) 400G InfiniBand NDR fat-tree — $20M, 1:1 non-blocking;
            (B) 200G Ethernet fat-tree — $8M, 2:1 oversubscription;
            (C) 3D-torus — $5M, fixed bisection per plane.
            Our primary workload is Llama-class model training with large AllReduce passes
            (32 GB gradient tensors every iteration). Which fabric do I choose?"
        </div>
        <div style="font-size:0.78rem; color:#475569; margin-top:8px; font-weight:600;">
            — Dr. Amara Osei, VP Infrastructure · Apex Foundation Models (1024-GPU cluster)
        </div>
    </div>
    """

    mo.Html(_vp_message_html)
    return
|
||
|
||
|
||
# ─── ACT II: CONCEPT SETUP ────────────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection):
    # Concept primer for Act II: how bisection bandwidth scales for the three
    # candidate fabrics. Hidden until both Act I widgets are answered.
    mo.stop(act1_prediction.value is None)
    mo.stop(act1_reflection.value is None)

    # Text fixes relative to the previous revision:
    #   * the torus formula now carries the "/ 2" that the dashboard code and
    #     the MathPeek equations use;
    #   * the closing example uses N = 4,096, because the per-GPU gap at equal
    #     link speed is N^(1/3) and 4096^(1/3) = 16 (16,384 would be ~25x).
    mo.vstack([
        mo.md("""
        ## Three Fabrics, One Bottleneck

        Each fabric architecture has a different bisection bandwidth model at scale:

        **InfiniBand NDR fat-tree (1:1 non-blocking):**
        All 1024 GPU uplinks have matching spine capacity. Bisection bandwidth scales
        linearly with cluster size. The most expensive option per port, but no contention.

        **Ethernet fat-tree (2:1 oversubscribed):**
        Half the spine uplinks are removed to reduce cost. Bisection bandwidth is halved.
        AllReduce sees only 50% of the per-GPU link speed at the bisection.

        **3D-torus:**
        Each GPU connects to 6 neighbors in three dimensions. There is no central spine.
        Bisection bandwidth cuts one dimension of the torus — for a 1024-GPU cube,
        that is a 10×10 plane of links. Bisection bandwidth scales as O(N^(2/3)), not O(N).

        ```
        3D-torus bisection BW = (N)^(2/3) × link_BW / 2
        Fat-tree bisection BW = N × link_BW / (2 × r)
        ```

        At small N, the 3D-torus can be competitive. At 1024 GPUs, the fat-tree
        advantage is pronounced. At 4,096 GPUs, the torus bisection bandwidth is
        roughly **16× lower** per GPU than a non-blocking fat-tree at the same link speed.
        """),
        mo.callout(mo.md(
            "**AllReduce at scale.** Every iteration of large-model training exchanges "
            "gradient tensors across all GPUs. For a 70B-parameter model in FP16, "
            "each AllReduce moves ~140 GB of data. The time to complete AllReduce "
            "determines whether GPUs are computing or waiting — it is the direct "
            "multiplier on total training time."
        ), kind="info"),
    ])
    return
|
||
|
||
|
||
# ─── ACT II: PREDICTION LOCK ──────────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection):
    # Act II prediction lock: a four-way radio the learner must answer before
    # the fabric instruments appear. Exported as `act2_prediction`.
    mo.stop(act1_prediction.value is None or act1_reflection.value is None)

    # Display label -> stored answer key.
    _fabric_choices = {
        "A) Option C — 3D-torus: cheapest option and bandwidth scales well for mesh workloads": "pred_torus",
        "B) Option B — Ethernet 2:1: good enough at 50% of IB performance for 40% of the cost": "pred_eth",
        "C) Option A — IB NDR fat-tree: AllReduce is all-to-all, bisection bandwidth is the bottleneck": "pred_ib",
        "D) All three topologies perform equally for AllReduce — link speed is all that matters": "pred_equal",
    }
    _question = """**Prediction Lock — Act II.**
For a 1024-GPU cluster running 32 GB AllReduce tensors with a target AllReduce
completion time < 1 second, which fabric design should the Infrastructure VP choose?"""

    act2_prediction = mo.ui.radio(options=_fabric_choices, label=_question)
    act2_prediction
    return (act2_prediction,)
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction):
    # Gate cell: shows a reminder while the Act II prediction is unanswered and
    # renders nothing once it has been made.
    mo.stop(act1_prediction.value is None or act1_reflection.value is None)

    _unlock_hint = mo.callout(
        mo.md("Select your prediction above to unlock the Fabric Design instruments."),
        kind="warn",
    )
    # mo.stop's second argument is displayed as the cell output when it halts.
    mo.stop(act2_prediction.value is None, _unlock_hint)
    return
|
||
|
||
|
||
# ─── ACT II: INSTRUMENTS ──────────────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction):
    # Act II controls: fabric dropdown, cluster-size slider and tensor-size
    # slider. All three widgets are exported for the dashboard cell.
    mo.stop(act1_prediction.value is None)
    mo.stop(act1_reflection.value is None)
    mo.stop(act2_prediction.value is None)

    act2_fabric = mo.ui.dropdown(
        options={
            "IB NDR fat-tree 1:1 (non-blocking)": "ib_fat_tree",
            "Ethernet fat-tree 2:1 (oversubscribed)": "eth_fat_tree",
            "3D-torus (fixed bisection)": "torus_3d",
        },
        value="IB NDR fat-tree 1:1 (non-blocking)",
        label="Fabric type",
    )
    act2_cluster_n = mo.ui.slider(
        start=64, stop=4096, value=1024, step=64,
        label="Cluster size N (GPUs)",
    )
    act2_model_gb = mo.ui.slider(
        start=1, stop=200, value=32, step=1,
        label="AllReduce tensor size (GB)",
    )

    mo.vstack([
        mo.md("### Fabric Design Dashboard"),
        # `gap` is a number of rem units in marimo's stack API; the previous
        # string "2rem" is not a valid value for it.
        mo.hstack([act2_fabric, act2_cluster_n], justify="start", gap=2),
        act2_model_gb,
    ])
    return (act2_fabric, act2_cluster_n, act2_model_gb)
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(
    mo, act1_prediction, act1_reflection, act2_prediction,
    act2_fabric, act2_cluster_n, act2_model_gb,
    go, apply_plotly_theme, COLORS, math,
    IB_NDR400_EFF_GBS, ETH_100G_EFF_GBS,
    IB_NDR_FABRIC_COST_M, ETH_400G_FABRIC_COST_M, TORUS_3D_FABRIC_COST_M,
):
    # Act II dashboard. Reads the fabric / cluster-size / tensor-size widgets,
    # evaluates the bisection-bandwidth and ring-AllReduce model for the
    # selected fabric, and renders: four metric cards, the worked formula, a
    # bandwidth-vs-N chart, and a cost-vs-AllReduce bar chart.
    #
    # NOTE(review): the tuple returned at the bottom contains `_`-prefixed
    # names. marimo documents underscore-prefixed names as cell-local, so the
    # downstream cells that list them as parameters presumably never receive
    # them — confirm by running the notebook and rename if so.
    mo.stop(act1_prediction.value is None)
    mo.stop(act1_reflection.value is None)
    mo.stop(act2_prediction.value is None)

    _fabric = act2_fabric.value
    _N = act2_cluster_n.value
    _msg_gb = act2_model_gb.value

    # ── Model helpers (shared by the headline numbers and both charts) ───────
    def _torus_plane_links(n):
        # floor(n^(2/3)). The small epsilon matters: for perfect cubes the
        # float power lands just below the integer (64 ** (2/3) ==
        # 15.999999999999998), so a bare int() would drop a whole link.
        return math.floor(n ** (2.0 / 3.0) + 1e-9)

    def _fat_tree_bisect(n, link_gbs, oversub):
        # Fat-tree bisection bandwidth: N × link_BW / (2 × r).
        return n * link_gbs / (2.0 * oversub)

    def _torus_bisect(n, link_gbs):
        # 3D-torus bisection bandwidth: plane_links × link_BW / 2.
        return _torus_plane_links(n) * link_gbs / 2.0

    def _ring_allreduce_sec(n, msg_gb, bw_bisect):
        # T = 2 × (N-1)/N × M / (BW_bisection / N); inf if no bandwidth.
        per_gpu = bw_bisect / n
        if per_gpu <= 0:
            return float("inf")
        factor = 2.0 * (n - 1) / n if n > 1 else 2.0
        return factor * msg_gb / per_gpu

    # Option B is the 200G Ethernet fabric; its per-GPU link rate is modelled
    # as twice the 100G effective rate (the constant itself lives elsewhere).
    _eth_200g_gbs = ETH_100G_EFF_GBS * 2.0

    # ── Per-fabric bisection bandwidth physics ────────────────────────────────
    # Source: @sec-network-fabrics-topology-comparison
    if _fabric == "ib_fat_tree":
        # IB NDR400 fat-tree, 1:1 non-blocking.
        _link_bw_gbs = IB_NDR400_EFF_GBS
        _oversub_ratio = 1.0
        _bw_bisect = _fat_tree_bisect(_N, _link_bw_gbs, _oversub_ratio)  # O(N)
        _fabric_name = "IB NDR fat-tree (1:1)"
        _fabric_color = COLORS["GreenLine"]
        _fabric_cost_m = IB_NDR_FABRIC_COST_M
        _topology_note = "Non-blocking · O(N) bisection BW"
    elif _fabric == "eth_fat_tree":
        # 200GbE fat-tree, 2:1 oversubscription.
        # NOTE(review): the cost constant is named ETH_400G_* although this
        # branch models the 200G option — name/value should be confirmed.
        _link_bw_gbs = _eth_200g_gbs
        _oversub_ratio = 2.0
        _bw_bisect = _fat_tree_bisect(_N, _link_bw_gbs, _oversub_ratio)  # O(N/2)
        _fabric_name = "Ethernet fat-tree (2:1)"
        _fabric_color = COLORS["OrangeLine"]
        _fabric_cost_m = ETH_400G_FABRIC_COST_M
        _topology_note = "2:1 Oversubscribed · O(N/2) bisection BW"
    else:
        # 3D-torus: each GPU has 6 neighbors (±x, ±y, ±z); the bisection cuts
        # one cross-sectional plane of about N^(2/3) links, each at the 100G
        # effective rate.
        # Source: @sec-network-fabrics-topology-comparison (torus bisection derivation)
        _link_bw_gbs = ETH_100G_EFF_GBS
        _plane_links = _torus_plane_links(_N)
        _bw_bisect = _plane_links * _link_bw_gbs / 2.0  # O(N^(2/3))
        _fabric_name = "3D-torus"
        _fabric_color = COLORS["RedLine"]
        _fabric_cost_m = TORUS_3D_FABRIC_COST_M
        _topology_note = f"Fixed mesh · O(N^(2/3)) bisection BW · plane links={_plane_links}"

    # ── AllReduce time ────────────────────────────────────────────────────────
    # Source: @sec-network-fabrics-allreduce (Rabenseifner ring-allreduce analysis)
    _bw_per_gpu = _bw_bisect / _N  # GB/s per GPU
    _allreduce_sec = _ring_allreduce_sec(_N, _msg_gb, _bw_bisect)

    # ── SLA threshold: AllReduce must complete within the stakeholder target ─
    _SLA_SEC = 1.0
    _sla_violated = _allreduce_sec > _SLA_SEC

    # ── Training efficiency (fraction of an iteration spent computing) ───────
    # Rough model: a fixed 5 s of compute per iteration, followed by the
    # AllReduce as a synchronization barrier.
    _COMPUTE_SEC = 5.0
    _train_eff_pct = _COMPUTE_SEC / (_COMPUTE_SEC + _allreduce_sec) * 100.0
    if _train_eff_pct >= 85:
        _eff_color = COLORS["GreenLine"]
    elif _train_eff_pct >= 60:
        _eff_color = COLORS["OrangeLine"]
    else:
        _eff_color = COLORS["RedLine"]

    # ── Cost-performance: GB/s of bisection bandwidth per $1M of fabric ──────
    _bw_per_dollar_m = _bw_bisect / _fabric_cost_m if _fabric_cost_m > 0 else 0.0

    # ── Chart 1: bisection bandwidth vs cluster size, all three fabrics ──────
    _n_range = list(range(64, 4097, 64))
    _bw_ib_fat = [_fat_tree_bisect(n, IB_NDR400_EFF_GBS, 1.0) for n in _n_range]
    _bw_eth_fat = [_fat_tree_bisect(n, _eth_200g_gbs, 2.0) for n in _n_range]
    _bw_torus = [_torus_bisect(n, ETH_100G_EFF_GBS) for n in _n_range]

    _fig = go.Figure()
    _fig.add_trace(go.Scatter(
        x=_n_range, y=_bw_ib_fat, mode="lines", name="IB NDR fat-tree (1:1)",
        line=dict(color=COLORS["GreenLine"], width=2.5),
    ))
    _fig.add_trace(go.Scatter(
        x=_n_range, y=_bw_eth_fat, mode="lines", name="Ethernet fat-tree (2:1)",
        line=dict(color=COLORS["OrangeLine"], width=2.5, dash="dash"),
    ))
    _fig.add_trace(go.Scatter(
        x=_n_range, y=_bw_torus, mode="lines", name="3D-torus",
        line=dict(color=COLORS["RedLine"], width=2.5, dash="dot"),
    ))

    # Star marker for the currently selected configuration.
    _fig.add_trace(go.Scatter(
        x=[_N], y=[_bw_bisect], mode="markers",
        name=f"Current: {_fabric_name}",
        marker=dict(symbol="star", size=14, color=_fabric_color,
                    line=dict(color="white", width=1.5)),
        showlegend=True,
    ))

    # SLA line: T ≈ 2 × N × M / BW_bisect < SLA  →  BW_bisect > 2 × N × M / SLA
    _min_bw_for_sla = [2.0 * n * _msg_gb / _SLA_SEC for n in _n_range]
    _fig.add_trace(go.Scatter(
        x=_n_range, y=_min_bw_for_sla, mode="lines",
        name=f"Min BW for <1s AllReduce ({_msg_gb} GB tensor)",
        line=dict(color=COLORS["RedLine"], width=1.5, dash="longdash"),
        opacity=0.65,
    ))

    _fig.update_layout(
        title=dict(
            text="Bisection Bandwidth vs Cluster Size by Topology",
            font=dict(size=13, color="#1e293b"), x=0,
        ),
        height=380,
        yaxis=dict(title="Total Bisection Bandwidth (GB/s)", gridcolor="#f1f5f9"),
        xaxis=dict(title="Cluster Size N (GPUs)"),
        legend=dict(orientation="h", y=-0.22, x=0, font=dict(size=10)),
        margin=dict(l=60, r=20, t=50, b=100),
    )
    apply_plotly_theme(_fig)

    # ── Chart 2: fabric cost bars annotated with AllReduce time at 1024 GPUs ─
    _REF_N = 1024
    _fabric_keys = ["ib_fat_tree", "eth_fat_tree", "torus_3d"]
    # Plotly renders "<br>" (not "\n") as a line break in tick labels.
    _fabrics = ["IB NDR fat-tree<br>(1:1)", "Ethernet fat-tree<br>(2:1)", "3D-torus"]
    _costs_m = [IB_NDR_FABRIC_COST_M, ETH_400G_FABRIC_COST_M, TORUS_3D_FABRIC_COST_M]
    _bw_at_ref = [
        _fat_tree_bisect(_REF_N, IB_NDR400_EFF_GBS, 1.0),
        _fat_tree_bisect(_REF_N, _eth_200g_gbs, 2.0),
        _torus_bisect(_REF_N, ETH_100G_EFF_GBS),
    ]
    # Same AllReduce formula as the headline metric, so chart and cards agree.
    _ar_times = [_ring_allreduce_sec(_REF_N, _msg_gb, bw) for bw in _bw_at_ref]
    _cost_colors = [COLORS["GreenLine"], COLORS["OrangeLine"], COLORS["RedLine"]]
    _selected_fab_idx = _fabric_keys.index(_fabric)

    _fig2 = go.Figure()
    _fig2.add_trace(go.Bar(
        x=_fabrics,
        y=_costs_m,
        marker=dict(
            color=_cost_colors,
            # Per-bar opacity must go on the marker: the trace-level `opacity`
            # property only accepts a single number.
            opacity=[1.0 if i == _selected_fab_idx else 0.45 for i in range(3)],
            line=dict(color=["white"] * 3, width=[0] * 3),
        ),
        text=[f"${c:.0f}M" for c in _costs_m],
        textposition="outside",
        textfont=dict(size=11, family="SF Mono, monospace"),
        width=0.55,
        name="Fabric cost ($M)",
    ))

    # Overlay the AllReduce time above each bar, green if within the SLA.
    for _i, (_fab, _ar_t) in enumerate(zip(_fabrics, _ar_times)):
        _note_color = "#008F45" if _ar_t < _SLA_SEC else "#CB202D"
        _fig2.add_annotation(
            x=_fab, y=_costs_m[_i] + 0.5,
            text=f"AllReduce: {_ar_t:.2f}s",
            showarrow=False,
            font=dict(size=9, color=_note_color, family="SF Mono, monospace"),
        )

    _fig2.update_layout(
        title=dict(
            text=f"Fabric Cost vs AllReduce Time — {_REF_N} GPUs, {_msg_gb} GB tensor",
            font=dict(size=13, color="#1e293b"), x=0,
        ),
        height=320,
        yaxis=dict(title="Fabric Cost ($M)", gridcolor="#f1f5f9"),
        xaxis=dict(title="Fabric Type"),
        showlegend=False,
        margin=dict(l=60, r=20, t=50, b=40),
    )
    apply_plotly_theme(_fig2)

    # ── Worked formula for the selected fabric ────────────────────────────────
    if _fabric == "torus_3d":
        _ib_bw_same_n = _fat_tree_bisect(_N, IB_NDR400_EFF_GBS, 1.0)
        _formula_text = f"""
        **Bisection Bandwidth — 3D-torus** (N={_N}, link={_link_bw_gbs:.1f} GB/s)

        ```
        Bisection links = N^(2/3) = {_N}^(2/3) = {_plane_links} links crossing bisection plane
        BW_bisection = {_plane_links} × {_link_bw_gbs:.1f} GB/s / 2 = {_bw_bisect:,.1f} GB/s
        BW per GPU = {_bw_bisect:,.1f} / {_N} = {_bw_per_gpu:.3f} GB/s
        AllReduce time ≈ 2 × {_msg_gb} GB / {_bw_per_gpu:.3f} GB/s = {_allreduce_sec:.2f} s
        ```
        **Scaling penalty at {_N} GPUs vs IB NDR fat-tree:**
        ```
        IB fat-tree BW = {_N} × {IB_NDR400_EFF_GBS:.1f} / 2 = {_ib_bw_same_n:.0f} GB/s
        3D-torus BW = {_bw_bisect:,.1f} GB/s
        Ratio = {_ib_bw_same_n / _bw_bisect:.1f}× lower bisection BW
        ```
        """
    else:
        _formula_text = f"""
        **Bisection Bandwidth — {_fabric_name}** (N={_N}, link={_link_bw_gbs:.1f} GB/s, r={_oversub_ratio:.0f}:1)

        ```
        BW_bisection = N × link_BW / (2 × r)
                     = {_N} × {_link_bw_gbs:.1f} GB/s / (2 × {_oversub_ratio:.0f})
                     = {_bw_bisect:,.1f} GB/s ← {_topology_note}

        BW per GPU = {_bw_bisect:,.1f} / {_N} = {_bw_per_gpu:.3f} GB/s
        AllReduce ≈ 2 × (N-1)/N × M / (BW/N)
                  ≈ 2 × {_msg_gb} GB / {_bw_per_gpu:.3f} GB/s
                  = {_allreduce_sec:.2f} s
        ```
        """

    # ── Metric cards ──────────────────────────────────────────────────────────
    def _metric_card(title, value, caption, accent, value_color=None):
        # One card: coloured top border, small title, large value, caption.
        value_color = accent if value_color is None else value_color
        return f"""
        <div style="padding:18px; border:1px solid #e2e8f0; border-radius:10px;
                    min-width:175px; text-align:center; background:white;
                    border-top:3px solid {accent};">
            <div style="color:#94a3b8; font-size:0.78rem; font-weight:600; margin-bottom:4px;">
                {title}
            </div>
            <div style="font-size:1.35rem; font-weight:800; color:{value_color};
                        font-family:'SF Mono',monospace;">
                {value}
            </div>
            <div style="font-size:0.72rem; color:#94a3b8; margin-top:4px;">
                {caption}
            </div>
        </div>"""

    _ar_color = COLORS["RedLine"] if _sla_violated else COLORS["GreenLine"]
    _card_specs = [
        ("Bisection BW", f"{_bw_bisect:,.0f} GB/s",
         _topology_note.split(" · ")[0], _fabric_color, None),
        ("AllReduce Time", f"{_allreduce_sec:.2f} s",
         f"target: &lt; {_SLA_SEC:.0f} s", _ar_color, None),
        ("Training Efficiency", f"{_train_eff_pct:.1f}%",
         "GPU compute fraction", _eff_color, None),
        ("Fabric Cost", f"${_fabric_cost_m:.0f}M",
         f"{_bw_per_dollar_m:,.0f} GB/s per $1M", COLORS["BlueLine"], COLORS["TextSec"]),
    ]
    _cards_html = (
        '<div style="display:flex; gap:16px; flex-wrap:wrap; margin:16px 0;">'
        + "".join(_metric_card(*_spec) for _spec in _card_specs)
        + "</div>"
    )

    mo.vstack([
        mo.Html(_cards_html),
        mo.md(_formula_text),
        mo.ui.plotly(_fig),
        mo.ui.plotly(_fig2),
    ])
    return (
        _allreduce_sec, _bw_bisect, _sla_violated,
        _fabric_name, _train_eff_pct, _fabric_cost_m,
    )
|
||
|
||
|
||
# ─── ACT II: FAILURE STATE ────────────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction,
      _allreduce_sec, _sla_violated, _fabric_name):
    # Failure-state banner: a red callout when the AllReduce target is missed,
    # a green one when it is met.
    #
    # NOTE(review): the three `_`-prefixed inputs are underscore names, which
    # marimo documents as cell-local — confirm that they actually reach this
    # cell from the dashboard cell.
    mo.stop(act1_prediction.value is None)
    mo.stop(act1_reflection.value is None)
    mo.stop(act2_prediction.value is None)

    # A marimo cell displays its *last expression*. The callouts used to be
    # bare expressions inside the if/else, so neither was ever rendered; build
    # the callout in the branch and emit it once at the end instead.
    if _sla_violated:
        _sla_banner = mo.callout(mo.md(
            f"**Bisection bandwidth bottleneck — AllReduce SLA violated.** "
            f"AllReduce requires **{_allreduce_sec:.2f} s** on **{_fabric_name}**. "
            f"This exceeds the 1-second target. "
            f"At this AllReduce time, GPUs spend more time waiting on gradient "
            f"synchronization than computing. Training throughput is severely degraded. "
            f"Reduce the tensor size, increase bisection bandwidth (lower oversubscription "
            f"or switch fabric), or increase cluster link speed to meet the SLA."
        ), kind="danger")
    else:
        _sla_banner = mo.callout(mo.md(
            f"**AllReduce SLA met.** "
            f"{_fabric_name} delivers AllReduce in **{_allreduce_sec:.2f} s** "
            f"— within the 1-second target. GPUs remain productive."
        ), kind="success")

    _sla_banner
    return
|
||
|
||
|
||
# ─── ACT II: PREDICTION FEEDBACK ──────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction,
      _allreduce_sec, _bw_bisect, _fabric_cost_m):
    # Feedback on the Act II prediction: one explanation per answer key,
    # shown as a success callout for "pred_ib" and a warning otherwise.
    # `_bw_bisect` and `_fabric_cost_m` are accepted but not read here.
    #
    # NOTE(review): the time interpolated into the "pred_ib" text is whatever
    # `_allreduce_sec` the dashboard currently reports, which is presumably not
    # the IB figure when another fabric is selected — confirm intent.
    mo.stop(
        act1_prediction.value is None
        or act1_reflection.value is None
        or act2_prediction.value is None
    )

    _why_not_torus = (
        "**Not quite.** The 3D-torus is the cheapest option, but its bisection "
        "bandwidth scales as O(N^(2/3)). At 1024 GPUs, the torus bisection "
        "bandwidth is roughly 10× lower than a non-blocking fat-tree. "
        "AllReduce on a 1024-GPU torus will take several seconds per iteration — "
        "far exceeding the 1-second target. The cost saving disappears when "
        "GPU utilization drops below 50%."
    )
    _why_not_eth = (
        "**Not quite.** The 2:1 Ethernet fat-tree halves bisection bandwidth "
        "compared to the IB non-blocking option. For large AllReduce tensors, "
        "this 2× reduction in bisection bandwidth doubles AllReduce time. "
        "At 32 GB tensor size with a 1024-GPU cluster, the Ethernet 2:1 fabric "
        "likely violates the 1-second SLA. The $12M saved on fabric is lost "
        "in reduced GPU utilization and extended training time."
    )
    _why_ib = (
        "**Correct.** For an all-to-all AllReduce workload, bisection bandwidth "
        "is the bottleneck — not per-link speed or switch count. "
        "The IB NDR fat-tree with 1:1 non-blocking provides the highest bisection "
        f"bandwidth at scale, completing AllReduce in {_allreduce_sec:.2f} s "
        "on the current configuration. The $20M fabric cost is justified: "
        "a cluster with 1024 GPUs at $25,000 each represents $25.6M in compute. "
        "A fabric that wastes 50% of compute time with slow AllReduce "
        "effectively discards $12M in GPU capacity."
    )
    _why_not_equal = (
        "**Not quite.** All three topologies perform very differently for AllReduce. "
        "Link speed matters only at the local level — bisection bandwidth determines "
        "the cross-cluster communication ceiling for all-to-all patterns. "
        "A fat-tree and a 3D-torus with the same link speed differ by an order of "
        "magnitude in AllReduce time at 1024 GPU scale because their bisection "
        "bandwidths scale differently with N."
    )

    _explanations = {
        "pred_torus": _why_not_torus,
        "pred_eth": _why_not_eth,
        "pred_ib": _why_ib,
        "pred_equal": _why_not_equal,
    }
    _picked = act2_prediction.value
    _tone = "success" if _picked == "pred_ib" else "warn"

    mo.callout(mo.md(_explanations[_picked]), kind=_tone)
    return
|
||
|
||
|
||
# ─── ACT II: MATHPEEK ACCORDION ───────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction):
    # "MathPeek" accordion: the AllReduce-time equations and how they
    # specialise for fat-tree and 3D-torus fabrics. Shown once the Act II
    # prediction has been made.
    mo.stop(act1_prediction.value is None)
    mo.stop(act1_reflection.value is None)
    mo.stop(act2_prediction.value is None)

    # Text fix relative to the previous revision: the closing sentence now
    # names the quantity that actually shrinks with N — the per-GPU bisection
    # share link_BW / (2 × N^(1/3)) — instead of "N^(1/3) × link_BW", which
    # grows with N and contradicts the equations above it.
    mo.accordion({
        "The governing equations: AllReduce time and topology scaling": mo.md("""
        **Ring-AllReduce time (Rabenseifner's algorithm):**

        ```
        T_allreduce = 2 × (N - 1) / N × M / BW_per_gpu
                    ≈ 2 × M / BW_per_gpu for large N

        where BW_per_gpu = BW_bisection / N
        ```

        - **M** — AllReduce message size (gradient tensor, GB)
        - **N** — Number of GPUs
        - **BW_per_gpu** — Each GPU's share of bisection bandwidth
        - Factor **2** — Reduce-scatter phase + all-gather phase

        Substituting bisection bandwidth for each topology:

        ```
        Fat-tree (r oversubscription):
          BW_bisection = N × link_BW / (2 × r)
          T_allreduce ≈ 2 × M × 2 × r / link_BW
                      = 4 × r × M / link_BW ← independent of N!

        3D-torus:
          BW_bisection = N^(2/3) × link_BW / 2
          BW_per_gpu = N^(2/3) × link_BW / (2 × N) = link_BW / (2 × N^(1/3))
          T_allreduce ≈ 4 × M × N^(1/3) / link_BW ← grows as N^(1/3)!
        ```

        **The critical insight:**

        For fat-tree, AllReduce time is **independent of cluster size N** (assuming
        fixed link_BW and oversubscription). Every GPU added brings its own uplink,
        keeping per-GPU bandwidth constant.

        For 3D-torus, AllReduce time grows as **N^(1/3)** — at 8,000 GPUs, it is
        20× slower than at 1 GPU on the same link speed.

        **Cost-performance trade-off:**

        | Topology | AllReduce scaling | Cost (1024 GPU) | BW/dollar |
        |------------------|-------------------|-----------------|-----------|
        | IB NDR fat-tree | O(1) | $20M | high |
        | Eth fat-tree 2:1 | O(1) but 2× slower| $8M | medium |
        | 3D-torus | O(N^(1/3)) | $5M | low at scale |

        The torus is cost-efficient only when clusters are small enough that the
        per-GPU bisection share, link_BW / (2 × N^(1/3)), still meets the latency target.
        """),
    })
    return
|
||
|
||
|
||
# ─── ACT II: STRUCTURED REFLECTION ───────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction):
    # Horizontal rule separating the Act II instruments from the reflection;
    # suppressed until all three earlier answers exist.
    _still_locked = (
        act1_prediction.value is None
        or act1_reflection.value is None
        or act2_prediction.value is None
    )
    mo.stop(_still_locked)

    mo.md("---")
    return
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction):
    # Act II structured reflection: a four-way radio exported as
    # `act2_reflection`, available once the Act II prediction is locked.
    mo.stop(
        act1_prediction.value is None
        or act1_reflection.value is None
        or act2_prediction.value is None
    )

    # Display label -> stored answer key.
    _torus_reasons = {
        "A) 3D-torus has higher latency than fat-tree because of more hops between non-adjacent nodes": "r2_a",
        "B) Torus bisection bandwidth scales as O(N^(2/3)) vs fat-tree O(N) — all-to-all traffic is bottlenecked by the plane boundaries": "r2_b",
        "C) 3D-torus requires special AllReduce algorithms that are less efficient than ring-allreduce": "r2_c",
        "D) 3D-torus topologies cannot support more than 512 GPUs due to diameter limits": "r2_d",
    }
    _prompt = """**Reflection — Act II.**
Why do 3D-torus topologies perform poorly for AllReduce at large cluster scales?"""

    act2_reflection = mo.ui.radio(options=_torus_reasons, label=_prompt)
    act2_reflection
    return (act2_reflection,)
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(mo, act1_prediction, act1_reflection, act2_prediction, act2_reflection):
    # Feedback on the Act II reflection: one explanation per answer key,
    # shown as a success callout for "r2_b" and a warning otherwise. While the
    # reflection is unanswered, a reminder callout is shown instead.
    mo.stop(act1_prediction.value is None)
    mo.stop(act1_reflection.value is None)
    mo.stop(act2_prediction.value is None)
    mo.stop(act2_reflection.value is None, mo.callout(
        mo.md("Select your answer to see the explanation."), kind="warn"
    ))

    _r2_correct = act2_reflection.value == "r2_b"

    _r2_feedback = {
        "r2_a": (
            "**Not quite.** Latency (hop count) is a separate property from bandwidth. "
            "A 3D-torus and a fat-tree can have similar average path lengths at the same "
            "cluster size. The AllReduce degradation is purely a bandwidth phenomenon: "
            "all-to-all traffic must cross the bisection, and the torus has fewer links "
            "at the bisection plane than the fat-tree. More hops contribute microseconds; "
            "inadequate bisection bandwidth contributes seconds."
        ),
        "r2_b": (
            "**Correct.** The torus bisection bandwidth grows as O(N^(2/3)) because "
            "the bisection cut passes through a cross-sectional plane of the torus — "
            "a 2D slice whose size grows as the square of the linear dimension. "
            "For a cube-shaped 3D-torus of N GPUs, the bisection has N^(2/3) links. "
            "Fat-tree bisection grows as O(N) because every GPU brings its own uplink "
            "to the spine. At 1024 GPUs, the fat-tree has 10× the bisection bandwidth "
            "of the torus per unit of link speed."
        ),
        "r2_c": (
            "**Not quite.** Standard ring-allreduce operates on any topology. "
            "The algorithm itself is topology-agnostic — it works by routing gradients "
            "around a logical ring of GPUs. The 3D-torus does not require special "
            "AllReduce algorithms. The performance difference is purely a function of "
            "available bisection bandwidth, not algorithmic efficiency."
        ),
        "r2_d": (
            "**Not quite.** 3D-torus topologies are used in some large HPC installations "
            "at thousands of nodes (e.g., early Blue Gene systems used 3D/5D torus). "
            "The problem is not a hard capacity limit but a scaling law: AllReduce time "
            # (4096 / 512)^(1/3) = 8^(1/3) = 2, so the slowdown is 2×, not 4×.
            "grows as N^(1/3) on a torus. At 4096 GPUs, this produces AllReduce times "
            "that are 2× slower than at 512 GPUs on the same link speed — making the "
            "topology progressively worse for ML training as clusters grow."
        ),
    }

    mo.callout(
        mo.md(_r2_feedback[act2_reflection.value]),
        kind="success" if _r2_correct else "warn",
    )
    return
|
||
|
||
|
||
# ─── LEDGER SAVE + HUD ────────────────────────────────────────────────────────
|
||
|
||
|
||
@app.cell(hide_code=True)
def _(
    mo, ledger, COLORS,
    context_toggle,
    act1_prediction, act1_reflection,
    act2_prediction, act2_reflection,
    act2_fabric, act2_cluster_n,
    _bw_bisect, _allreduce_sec, _sla_violated, _fabric_name, _fabric_cost_m,
):
    # Ledger save + HUD footer for Vol II · Lab 03.
    #
    # 1. Once every prediction/reflection widget has a selection, persist the
    #    student's Act II design through `ledger.save`.
    # 2. Always render the footer HUD summarising lab progress.
    #
    # `COLORS`, `_fabric_name` and `_fabric_cost_m` are accepted but not used
    # in this cell; the signature is left untouched.
    #
    # NOTE(review): marimo documents underscore-prefixed names as cell-local.
    # Confirm that `_bw_bisect`, `_allreduce_sec` and `_sla_violated` really
    # resolve here; if they do not, the producing cell must export them under
    # public names.

    # All locals stay underscore-prefixed so this cell defines no globals.
    _act1_status = act1_prediction.value is not None
    _act2_status = act2_prediction.value is not None
    _act1_correct = act1_prediction.value == "option_b"
    _act2_correct = act2_prediction.value == "pred_ib"

    # Only save when both acts are complete (prediction + reflection each).
    _acts_complete = all(
        _widget.value is not None
        for _widget in (
            act1_prediction, act1_reflection,
            act2_prediction, act2_reflection,
        )
    )

    if _acts_complete:
        # Oversubscription ratio recorded per fabric choice; any fabric not
        # listed here is stored as 0.0.
        _oversub_by_fabric = {"eth_fat_tree": 2.0, "ib_fat_tree": 1.0}
        ledger.save(
            chapter="v2_03",
            design={
                "context": context_toggle.value,
                "fabric_type": act2_fabric.value,
                "cluster_size": act2_cluster_n.value,
                "oversubscription": _oversub_by_fabric.get(act2_fabric.value, 0.0),
                # NOTE(review): key says "gbps" while the HUD labels the same
                # value "GB/s" — confirm the intended unit.
                "bisection_bw_gbps": round(_bw_bisect, 1),
                "act1_prediction": act1_prediction.value,
                "act1_correct": _act1_correct,
                "act2_result": round(_allreduce_sec, 3),
                "act2_decision": act2_fabric.value,
                "constraint_hit": _sla_violated,
            },
        )

    # ── HUD footer ────────────────────────────────────────────────────────────
    # Act II metrics are tri-state: pending (no prediction yet), violated, met.
    # The SLA item previously showed "Violated" even while Act II was pending,
    # unlike the Bisection BW / AllReduce items; all three now share the gate.
    if not _act2_status:
        _sla_text, _sla_cls = "—", "hud-value"
    elif _sla_violated:
        _sla_text, _sla_cls = "Violated", "hud-none"
    else:
        _sla_text, _sla_cls = "Met", "hud-active"

    def _progress(_answered, _correct):
        # (text, css class) for an act's prediction status.
        if _correct:
            return "Correct", "hud-active"
        if _answered:
            return "Answered", "hud-value"
        return "Pending", "hud-none"

    _act1_text, _act1_cls = _progress(_act1_status, _act1_correct)
    _act2_text, _act2_cls = _progress(_act2_status, _act2_correct)

    _hud_items = [
        ("Lab", "Vol II · Lab 03", "hud-value"),
        ("Context", context_toggle.value, "hud-value"),
        ("Act I", _act1_text, _act1_cls),
        ("Act II", _act2_text, _act2_cls),
        # Numeric values are formatted only once Act II has been answered.
        ("Bisection BW", f"{_bw_bisect:,.0f} GB/s" if _act2_status else "—",
         "hud-value"),
        ("AllReduce", f"{_allreduce_sec:.2f} s" if _act2_status else "—",
         _sla_cls),
        ("SLA", _sla_text, _sla_cls),
        ("Ledger", "Saved" if _acts_complete else "Incomplete",
         "hud-active" if _acts_complete else "hud-none"),
    ]

    _hud_cells = "".join(
        f"""<div style="display:flex; flex-direction:column; gap:2px;">
            <span class="hud-label">{label}</span>
            <span class="{cls}">{val}</span>
        </div>"""
        for label, val, cls in _hud_items
    )

    mo.Html(f"""
    <div class="lab-hud" style="margin-top:32px;">
        {_hud_cells}
    </div>
    """)
    return
|
||
|
||
|
||
# Script entry point: run the marimo app when this file is executed directly.
if __name__ == "__main__":
    app.run()
|