Add all Vol1 (labs 01-16) and Vol2 (labs 01-17) interactive Marimo labs as the
first complete first-pass implementation of the ML Systems curriculum labs.

Each lab follows the PROTOCOL 2-Act structure (35-40 min):
- Act I: Calibration with prediction lock → instruments → overlay
- Act II: Design challenge with failure states and reflection

Key pedagogical instruments introduced progressively:
- Vol1: D·A·M Triad, Iron Law, Memory Ledger, Roofline, Amdahl's Law,
  Little's Law, P99 Histogram, Compression Frontier, Chouldechova theorem
- Vol2: NVLink vs PCIe cliff, Bisection BW, Young-Daly T*, Parallelism Paradox,
  AllReduce ring vs tree, KV-cache model, Jevons Paradox, DP ε-δ tradeoff,
  SLO composition, Adversarial Pareto, two-volume synthesis capstone

All 35 staged files pass AST syntax verification (36/36 including lab_00).

Also includes:
- labs/LABS_SPEC.md: authoritative sub-agent brief for all lab conventions
- labs/core/style.py: expanded unified design system with semantic color tokens
import marimo

__generated_with = "0.19.6"
app = marimo.App(width="full")


# ─────────────────────────────────────────────────────────────────────────────
# LAB 04: THE DATA GRAVITY TRAP — DISTRIBUTED SCALE
#
# Chapter: data_storage.qmd (@sec-data-storage)
# Core Invariant: Data gravity — once data is stored somewhere, computation
#   tends to move TO the data rather than data moving to compute. At
#   distributed training scale, the I/O system becomes the bottleneck when
#   reading training data. Distributed file systems (Lustre, GPFS) provide
#   aggregate bandwidth but introduce coordination overhead.
#
# 2-Act structure (35-40 min total):
#   Act I: The I/O Bottleneck (12-15 min)
#     128 H100s at 12% GPU utilization despite a 672 GB/s Lustre cluster.
#     Prediction lock → I/O analyzer → reveal → reflection → MathPeek
#   Act II: Distributed Storage Architecture (20-25 min)
#     Design a 4096-GPU cluster's storage layer. Metadata bottleneck lurks.
#     Prediction lock → storage designer → failure state → reflection → MathPeek
#
# Deployment contexts:
#   NVMe: Single-node local NVMe (7 GB/s per drive, 4 drives = 28 GB/s)
#   Distributed FS: Lustre/GPFS cluster (OST nodes × 28 GB/s each, MDS ceiling)
#
# Traceability:
#   H100 HBM bandwidth — @sec-data-storage-fuel-line (H100_MEM_BW = 3.35 TB/s)
#   NVMe sequential BW — @sec-data-storage-fuel-line (NVME_SEQUENTIAL_BW = 7 GB/s)
#   Lustre OST BW — @sec-data-storage (4 × NVMe per OST = 28 GB/s)
#   GPU utilization formula — @sec-data-storage (t_compute / t_total)
#   Prefetch depth model — @sec-data-storage (queue depth × batch_read_time)
#   Metadata ops rate — @sec-data-storage (gpus × workers × ops_per_open)
#   IB HDR200 bandwidth — @sec-network-fabrics (200 Gbps = 25 GB/s per port)
#
# Design Ledger save:
#   chapter="v2_04", context, storage_type, gpu_count, data_workers_per_gpu,
#   io_throughput_gbs, act1_prediction, act1_correct, act2_result,
#   act2_decision, constraint_hit, mds_saturated
# ─────────────────────────────────────────────────────────────────────────────


# ─── CELL 0: SETUP (hide_code=False — leave visible) ─────────────────────────
@app.cell
def _():
    import marimo as mo
    import sys
    import math
    from pathlib import Path
    import plotly.graph_objects as go
    import numpy as np

    _root = Path(__file__).resolve().parents[2]
    if str(_root) not in sys.path:
        sys.path.insert(0, str(_root))

    from labs.core.state import DesignLedger
    from labs.core.style import COLORS, LAB_CSS, apply_plotly_theme

    ledger = DesignLedger()
    return mo, ledger, COLORS, LAB_CSS, apply_plotly_theme, go, np, math

# ─── CELL 1: HEADER ──────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, LAB_CSS, COLORS):
    _c = COLORS["Cloud"]
    mo.vstack([
        LAB_CSS,
        mo.Html(f"""
        <div style="background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
                    padding: 36px 44px; border-radius: 16px; color: white;
                    box-shadow: 0 8px 32px rgba(0,0,0,0.3); margin-bottom: 8px;">
          <div style="font-size: 0.72rem; font-weight: 700; letter-spacing: 0.18em;
                      color: #475569; text-transform: uppercase; margin-bottom: 10px;">
            Machine Learning Systems · Volume II · Lab 04
          </div>
          <h1 style="margin: 0 0 10px 0; font-size: 2.2rem; font-weight: 900;
                     color: #f8fafc; line-height: 1.1; letter-spacing: -0.02em;">
            The I/O Bottleneck
          </h1>
          <p style="margin: 0 0 22px 0; font-size: 1.02rem; color: #94a3b8;
                    max-width: 680px; line-height: 1.65;">
            128 H100s sit at 12% GPU utilization despite a 672 GB/s Lustre cluster.
            The storage system looks fine on paper. The accelerators are starving.
            Before buying more hardware, you need to find the actual bottleneck.
          </p>
          <div style="display: flex; gap: 12px; flex-wrap: wrap; align-items: center;">
            <span style="background: rgba(99,102,241,0.15); color: #a5b4fc;
                         padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
                         font-weight: 600; border: 1px solid rgba(99,102,241,0.25);">
              Act I: I/O Bottleneck · 12–15 min
            </span>
            <span style="background: rgba(99,102,241,0.15); color: #a5b4fc;
                         padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
                         font-weight: 600; border: 1px solid rgba(99,102,241,0.25);">
              Act II: Storage Architecture · 20–25 min
            </span>
            <span style="background: rgba(16,185,129,0.15); color: #6ee7b7;
                         padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
                         font-weight: 600; border: 1px solid rgba(16,185,129,0.25);">
              35–40 min total
            </span>
            <span class="badge badge-info">Chapter 4: Data Storage</span>
          </div>
        </div>
        """),
    ])
    return

# ─── CELL 2: RECOMMENDED READING ─────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.callout(mo.md("""
    **Recommended Reading** — Complete the following before this lab:

    - **@sec-data-storage-fuel-line** (The Fuel Line) — The 479× gap between H100 HBM
      bandwidth (3.35 TB/s) and a single NVMe drive (7 GB/s), and why this gap cannot be
      closed by any single technology. The hierarchical storage model is the only response.
    - **@sec-data-storage-workload-inversion** (How ML Workloads Invert Storage Assumptions)
      — Why ML training reads every byte sequentially exactly once per epoch, why there is
      no hot-data subset, and how this inverts every assumption from database storage design.
    - **@sec-data-storage** (Data Pipeline Architecture) — Prefetching, pipelining, and the
      GPU Direct Storage pathway that eliminates the CPU from the data path entirely.
    """), kind="info")
    return

# ─── CELL 3: CONTEXT TOGGLE ──────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    context_toggle = mo.ui.radio(
        options={
            "NVMe (single node, 4 drives at 7 GB/s each)": "nvme",
            "Distributed FS (Lustre/GPFS cluster, OST nodes)": "distributed_fs",
        },
        value="Distributed FS (Lustre/GPFS cluster, OST nodes)",
        label="Storage context:",
        inline=True,
    )
    mo.vstack([
        mo.md("---"),
        mo.md("### Select Your Storage Context"),
        mo.md(
            "This choice sets the storage architecture for both acts. "
            "NVMe reflects a single training node reading local drives. "
            "Distributed FS reflects a production multi-node Lustre/GPFS deployment."
        ),
        context_toggle,
    ])
    return (context_toggle,)


@app.cell(hide_code=True)
def _(mo, context_toggle, COLORS):
    _ctx = context_toggle.value
    if _ctx == "nvme":
        _bw_desc = "4 × NVMe = 28 GB/s local storage bandwidth per node"
        _arch_desc = "No metadata server overhead — direct kernel I/O"
        _color = COLORS["GreenLine"]
        _label = "NVMe — Single Node"
    else:
        _bw_desc = "24 OST nodes × 28 GB/s = 672 GB/s aggregate Lustre bandwidth"
        _arch_desc = "Metadata server (MDS) coordinates all namespace operations"
        _color = COLORS["Cloud"]
        _label = "Distributed FS — Lustre/GPFS"

    mo.callout(mo.md(
        f"**Context: {_label}** — Storage bandwidth: {_bw_desc}. "
        f"Architecture note: {_arch_desc}. "
        "The I/O bottleneck analysis in Act I will use these parameters."
    ), kind="info")
    return

# ═════════════════════════════════════════════════════════════════════════════
# ACT I: THE I/O BOTTLENECK
# ═════════════════════════════════════════════════════════════════════════════

@app.cell(hide_code=True)
def _(mo, COLORS):
    _c = COLORS["BlueLine"]
    mo.Html(f"""
    <div style="margin: 32px 0 8px 0;">
      <div style="font-size: 0.72rem; font-weight: 700; letter-spacing: 0.14em;
                  text-transform: uppercase; color: {_c}; margin-bottom: 6px;">
        Act I · 12–15 minutes
      </div>
      <div style="font-size: 1.6rem; font-weight: 800; color: #0f172a; line-height: 1.2;">
        The I/O Bottleneck
      </div>
      <div style="font-size: 0.95rem; color: #475569; margin-top: 6px; max-width: 700px;">
        Your training platform has 128 H100 GPUs and a 24-node Lustre cluster providing
        672 GB/s of aggregate storage bandwidth. GPU utilization is at 12%.
        Before touching the simulator, commit to a diagnosis.
      </div>
    </div>
    """)
    return

# ─── ACT I: STAKEHOLDER MESSAGE ──────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, COLORS):
    _c = COLORS["OrangeLine"]
    mo.Html(f"""
    <div style="border-left: 4px solid {_c}; background: {COLORS['OrangeLL']};
                border-radius: 0 10px 10px 0; padding: 16px 22px; margin: 12px 0;">
      <div style="font-size: 0.72rem; font-weight: 700; color: {_c};
                  text-transform: uppercase; letter-spacing: 0.1em; margin-bottom: 6px;">
        Incoming Message · Training Platform Lead
      </div>
      <div style="font-style: italic; font-size: 1.0rem; color: #1e293b; line-height: 1.65;">
        "We have 128 H100s, each capable of consuming training data at 3.35 TB/s from HBM.
        We store our training data on a 24-node Lustre cluster — each OST node has 4 NVMe
        SSDs at 7 GB/s each, giving 28 GB/s per node and 672 GB/s in aggregate.
        Our GPUs are sitting at 12% utilization during the data loading phase.
        The infrastructure team says the Lustre cluster is healthy and nowhere near saturated.
        We're about to request budget for another 256 GPUs. Should we?"
      </div>
    </div>
    """)
    return

# ─── ACT I: THE NUMBERS ──────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, COLORS):
    _c = COLORS["BlueLine"]
    mo.Html(f"""
    <div style="background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 12px;
                padding: 20px 24px; margin: 12px 0;">
      <div style="font-size: 0.82rem; font-weight: 700; color: #475569; text-transform: uppercase;
                  letter-spacing: 0.08em; margin-bottom: 14px;">
        The Arithmetic Before You Touch the Simulator
      </div>
      <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px;">
        <div>
          <div style="font-size: 0.82rem; color: #94a3b8; margin-bottom: 4px;">
            H100 HBM bandwidth (NVIDIA spec)
          </div>
          <div style="font-size: 1.2rem; font-weight: 700; color: {_c};">
            3,350 GB/s per GPU
          </div>
        </div>
        <div>
          <div style="font-size: 0.82rem; color: #94a3b8; margin-bottom: 4px;">
            Lustre aggregate bandwidth (24 OSTs × 28 GB/s)
          </div>
          <div style="font-size: 1.2rem; font-weight: 700; color: {_c};">
            672 GB/s total
          </div>
        </div>
        <div>
          <div style="font-size: 0.82rem; color: #94a3b8; margin-bottom: 4px;">
            Actual data ingestion needed per GPU (4 MB batch / 50 ms step)
          </div>
          <div style="font-size: 1.2rem; font-weight: 700; color: {COLORS['GreenLine']};">
            ~80 MB/s per GPU
          </div>
        </div>
        <div>
          <div style="font-size: 0.82rem; color: #94a3b8; margin-bottom: 4px;">
            Total I/O demand for 128 GPUs (128 × 80 MB/s)
          </div>
          <div style="font-size: 1.2rem; font-weight: 700; color: {COLORS['GreenLine']};">
            ~10.24 GB/s needed
          </div>
        </div>
      </div>
      <div style="margin-top: 14px; font-size: 0.88rem; color: #475569; line-height: 1.65;
                  border-top: 1px solid #e2e8f0; padding-top: 12px;">
        <strong>The gap is striking:</strong> 672 GB/s of Lustre capacity against only 10.24 GB/s
        of actual demand. The cluster is at ~1.5% utilization. The Lustre hardware is
        not the problem. Something else is consuming the time between batches.
      </div>
    </div>
    """)
    return

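# A quick back-of-envelope check of the arithmetic in the card above, runnable
# in any REPL. All figures come from the scenario itself; the constant names
# below are new and feed nothing in the lab UI.
PER_GPU_DEMAND_GBS = 0.08                      # 80 MB/s per GPU
TOTAL_DEMAND_GBS = 128 * PER_GPU_DEMAND_GBS    # 10.24 GB/s across the cluster
LUSTRE_SUPPLY_GBS = 24 * 28.0                  # 672 GB/s aggregate (24 OSTs × 28 GB/s)
SUPPLY_UTILIZATION_PCT = 100.0 * TOTAL_DEMAND_GBS / LUSTRE_SUPPLY_GBS  # ≈ 1.5%
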
# ─── ACT I: PREDICTION LOCK ──────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.md("""
    #### Your Prediction

    *Before interacting with the I/O analyzer, commit to your diagnosis.*
    The storage cluster is at ~1.5% utilization. The GPUs are at 12%. The gap is real.
    """)
    return


@app.cell(hide_code=True)
def _(mo):
    act1_prediction = mo.ui.radio(
        options={
            "A) The Lustre cluster is saturated — 672 GB/s is not enough for 128 GPUs": "A",
            "B) Small random reads — the access pattern does not match Lustre's sequential throughput profile": "B",
            "C) Network bandwidth between the Lustre nodes and the GPU compute nodes is saturated": "C",
            "D) 12% utilization is normal — data loading always accounts for 88% of training time": "D",
        },
        label="GPU utilization is 12% despite a 672 GB/s Lustre cluster. The most likely root cause is:",
    )
    act1_prediction
    return (act1_prediction,)


@app.cell(hide_code=True)
def _(mo, act1_prediction):
    mo.stop(
        act1_prediction.value is None,
        mo.callout(
            mo.md("Select your prediction to unlock the I/O bottleneck analyzer."),
            kind="warn",
        ),
    )
    mo.callout(
        mo.md(
            f"**Prediction locked: {act1_prediction.value[:2]}** "
            "Now use the I/O analyzer to test your hypothesis. "
            "Adjust the data loader configuration to see what moves the needle."
        ),
        kind="info",
    )
    return

# ─── ACT I: INSTRUMENTS ──────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.md("""
    #### The I/O Bottleneck Analyzer

    Adjust the data loading configuration below to see how GPU utilization responds.
    The physics engine models the relationship between prefetch depth, worker count,
    read pattern, and effective I/O throughput.
    """)
    return


@app.cell(hide_code=True)
def _(mo):
    act1_num_workers = mo.ui.slider(
        start=1, stop=16, value=1, step=1,
        label="Data loader workers per GPU",
        show_value=True,
    )
    act1_batch_size = mo.ui.slider(
        start=32, stop=512, value=64, step=32,
        label="Batch size (samples)",
        show_value=True,
    )
    act1_read_pattern = mo.ui.dropdown(
        options={
            "Sequential (large chunks, Lustre-optimal)": "sequential",
            "Random (small 4K reads, worst case)": "random",
            "Prefetch pipeline (async, overlapped with compute)": "prefetch",
        },
        value="Random (small 4K reads, worst case)",
        label="Read pattern",
    )
    act1_ost_count = mo.ui.slider(
        start=4, stop=48, value=24, step=4,
        label="Active Lustre OST nodes",
        show_value=True,
    )
    mo.hstack([
        act1_num_workers,
        act1_batch_size,
        act1_read_pattern,
        act1_ost_count,
    ], gap=2, justify="start")
    return (act1_num_workers, act1_batch_size, act1_read_pattern, act1_ost_count)

@app.cell(hide_code=True)
def _(
    mo, act1_num_workers, act1_batch_size, act1_read_pattern, act1_ost_count,
    context_toggle, go, apply_plotly_theme, COLORS, math,
):
    # ── Hardware constants ─────────────────────────────────────────────────────
    # Source: @sec-data-storage-fuel-line
    _H100_BW_GBS = 3350.0       # H100 SXM5 HBM3e bandwidth, NVIDIA spec sheet
    _NVME_SEQ_BW_GBS = 7.0      # PCIe Gen4 NVMe sequential read bandwidth
    _NVME_RAND_BW_GBS = 0.5     # NVMe random 4K read effective throughput
    _LUSTRE_OST_BW_GBS = 28.0   # Per OST node (4 × NVMe at 7 GB/s each)
    _IB_HDR200_BW_GBS = 25.0    # InfiniBand HDR-200 per port (200 Gbps = 25 GB/s)

    # ── Workload parameters ────────────────────────────────────────────────────
    _GPU_COUNT = 128            # fixed for Act I scenario
    _SAMPLE_SIZE_MB = 4.0       # 4 MB per training sample (typical image/text)
    _COMPUTE_TIME_MS = 50.0     # GPU compute time per batch (H100, realistic)

    # ── Context-dependent base bandwidth ──────────────────────────────────────
    _ctx = context_toggle.value
    if _ctx == "nvme":
        # Single node: 4 × NVMe = 28 GB/s, no network overhead
        _base_storage_bw = 28.0   # GB/s (4 × 7 GB/s NVMe)
        _network_overhead = 1.0   # no network factor for local NVMe
    else:
        # Distributed Lustre: aggregate bandwidth from active OSTs
        _base_storage_bw = act1_ost_count.value * _LUSTRE_OST_BW_GBS
        # Network overhead: IB connects compute to storage, capped at IB bandwidth
        # 128 GPUs × 1 IB port per node = need aggregate fabric bandwidth
        _network_overhead = min(1.0, (_IB_HDR200_BW_GBS * 16) / _base_storage_bw)

    # ── Read pattern factor ────────────────────────────────────────────────────
    # Sequential reads can sustain near-peak bandwidth on Lustre
    # Random 4K reads hit IOPS wall: OST NVMe at ~800K IOPS × 4KB = 3.2 GB/s per OST
    # Prefetch: async pipeline overlaps I/O with compute, hides much of the latency
    _pattern_factor = {
        "sequential": 0.85,  # near-peak sequential read efficiency
        "random": 0.07,      # 4K random read: ~7% of sequential bandwidth
        "prefetch": 0.78,    # async prefetch hides latency, near-sequential efficiency
    }[act1_read_pattern.value]

    # ── Worker scaling ─────────────────────────────────────────────────────────
    # Workers per GPU parallelize reads across CPU cores
    # Diminishing returns: Amdahl-style. Beyond 8 workers, OS scheduling overhead grows.
    # Source: empirical characterization from @sec-data-storage
    _workers = act1_num_workers.value
    if _workers == 1:
        _worker_factor = 0.25  # 1 worker: sequential, blocking reads — severe bottleneck
    elif _workers <= 4:
        _worker_factor = 0.25 + (_workers - 1) * 0.20  # near-linear scaling 1–4
    elif _workers <= 8:
        _worker_factor = 0.85 + (_workers - 4) * 0.03  # sub-linear scaling 4–8
    else:
        _worker_factor = min(0.97 + (_workers - 8) * 0.002, 1.0)  # minimal gain > 8

    # ── Prefetch queue depth ───────────────────────────────────────────────────
    # Prefetch depth: how many batches ahead we read asynchronously
    # depth = workers × overlap_factor (pattern-dependent)
    _overlap_factor = {"sequential": 2.0, "random": 0.5, "prefetch": 4.0}[act1_read_pattern.value]
    _prefetch_depth = max(1, int(_workers * _overlap_factor))

    # ── Effective I/O bandwidth per GPU ───────────────────────────────────────
    _effective_storage_bw = _base_storage_bw * _pattern_factor * _worker_factor * _network_overhead
    # Per-GPU share of the storage bandwidth
    _per_gpu_bw = _effective_storage_bw / _GPU_COUNT  # GB/s per GPU

    # ── Batch load time and GPU utilization ───────────────────────────────────
    _batch_gb = (act1_batch_size.value * _SAMPLE_SIZE_MB) / 1024.0
    _t_io_ms = (_batch_gb / _per_gpu_bw) * 1000.0 if _per_gpu_bw > 0 else 9999.0

    # Prefetch pipeline: if prefetch, I/O and compute overlap
    # Effective wait = max(0, t_io - compute_time × prefetch_depth)
    if act1_read_pattern.value == "prefetch":
        _t_wait_ms = max(0.0, _t_io_ms - _COMPUTE_TIME_MS * min(_prefetch_depth, 4))
    else:
        _t_wait_ms = max(0.0, _t_io_ms - _COMPUTE_TIME_MS)

    _t_total_ms = _COMPUTE_TIME_MS + _t_wait_ms
    _gpu_util = min((_COMPUTE_TIME_MS / _t_total_ms) * 100.0, 100.0) if _t_total_ms > 0 else 0.0

    # ── Color coding ──────────────────────────────────────────────────────────
    if _gpu_util >= 80:
        _util_color = COLORS["GreenLine"]
        _util_label = "Healthy"
    elif _gpu_util >= 50:
        _util_color = COLORS["OrangeLine"]
        _util_label = "Degraded"
    else:
        _util_color = COLORS["RedLine"]
        _util_label = "Starved"

    # ── GPU utilization vs num_workers curve ──────────────────────────────────
    _worker_range = list(range(1, 17))
    _util_curve = []
    for _w in _worker_range:
        if _w == 1:
            _wf = 0.25
        elif _w <= 4:
            _wf = 0.25 + (_w - 1) * 0.20
        elif _w <= 8:
            _wf = 0.85 + (_w - 4) * 0.03
        else:
            _wf = min(0.97 + (_w - 8) * 0.002, 1.0)
        _eff_bw_w = _base_storage_bw * _pattern_factor * _wf * _network_overhead
        _per_gpu_bw_w = _eff_bw_w / _GPU_COUNT
        _t_io_w = (_batch_gb / _per_gpu_bw_w) * 1000.0 if _per_gpu_bw_w > 0 else 9999.0
        _ov_f = {"sequential": 2.0, "random": 0.5, "prefetch": 4.0}[act1_read_pattern.value]
        _pd_w = max(1, int(_w * _ov_f))
        if act1_read_pattern.value == "prefetch":
            _tw = max(0.0, _t_io_w - _COMPUTE_TIME_MS * min(_pd_w, 4))
        else:
            _tw = max(0.0, _t_io_w - _COMPUTE_TIME_MS)
        _tt = _COMPUTE_TIME_MS + _tw
        _u = min((_COMPUTE_TIME_MS / _tt) * 100.0, 100.0) if _tt > 0 else 0.0
        _util_curve.append(_u)

    _fig = go.Figure()
    _fig.add_trace(go.Scatter(
        x=_worker_range,
        y=_util_curve,
        mode="lines+markers",
        line=dict(color=COLORS["BlueLine"], width=2.5),
        marker=dict(size=7, color=COLORS["BlueLine"]),
        name="GPU Utilization (%)",
    ))
    # Highlight current workers setting
    _fig.add_trace(go.Scatter(
        x=[_workers],
        y=[_gpu_util],
        mode="markers",
        marker=dict(size=14, color=_util_color, symbol="circle", line=dict(width=2, color="white")),
        name=f"Current: {_workers} workers → {_gpu_util:.0f}%",
    ))
    # Target line at 80%
    _fig.add_hline(
        y=80,
        line_dash="dash",
        line_color=COLORS["GreenLine"],
        annotation_text="Target: 80% GPU utilization",
        annotation_position="top right",
    )
    _fig.update_layout(
        height=300,
        xaxis_title="Data loader workers per GPU",
        yaxis_title="GPU Utilization (%)",
        yaxis=dict(range=[0, 105]),
        showlegend=True,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0),
        margin=dict(l=50, r=20, t=50, b=50),
        title=dict(
            text=f"GPU Utilization vs. Workers ({act1_read_pattern.value}, {act1_ost_count.value} OSTs)",
            font=dict(size=13),
        ),
    )
    apply_plotly_theme(_fig)

    # ── Metric cards ──────────────────────────────────────────────────────────
    _lustre_util_pct = (_effective_storage_bw / (_base_storage_bw + 0.001)) * 100.0
    _lustre_color = COLORS["RedLine"] if _lustre_util_pct < 20 else COLORS["OrangeLine"] if _lustre_util_pct < 60 else COLORS["GreenLine"]

    _cards_html = f"""
    <div style="display: flex; gap: 16px; flex-wrap: wrap; margin: 16px 0;">
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 165px; text-align: center; background: white;
                  border-top: 4px solid {_util_color};">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          GPU Utilization
        </div>
        <div style="font-size: 2.2rem; font-weight: 800; color: {_util_color}; line-height: 1;">
          {_gpu_util:.0f}%
        </div>
        <div style="color: {_util_color}; font-size: 0.78rem; font-weight: 700; margin-top: 4px;">
          {_util_label}
        </div>
      </div>
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 165px; text-align: center; background: white;">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          Effective I/O BW
        </div>
        <div style="font-size: 2.2rem; font-weight: 800; color: {COLORS['BlueLine']}; line-height: 1;">
          {_effective_storage_bw:.1f}
        </div>
        <div style="color: #94a3b8; font-size: 0.78rem; margin-top: 4px;">GB/s total</div>
      </div>
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 165px; text-align: center; background: white;">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          I/O Wait
        </div>
        <div style="font-size: 2.2rem; font-weight: 800; color: {_util_color}; line-height: 1;">
          {_t_wait_ms:.1f}
        </div>
        <div style="color: #94a3b8; font-size: 0.78rem; margin-top: 4px;">ms per batch</div>
      </div>
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 165px; text-align: center; background: white;">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          Prefetch Queue
        </div>
        <div style="font-size: 2.2rem; font-weight: 800; color: {COLORS['BlueLine']}; line-height: 1;">
          {_prefetch_depth}
        </div>
        <div style="color: #94a3b8; font-size: 0.78rem; margin-top: 4px;">batches ahead</div>
      </div>
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 165px; text-align: center; background: white;">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          Storage Utilization
        </div>
        <div style="font-size: 2.2rem; font-weight: 800; color: {_lustre_color}; line-height: 1;">
          {_lustre_util_pct:.0f}%
        </div>
        <div style="color: #94a3b8; font-size: 0.78rem; margin-top: 4px;">of capacity</div>
      </div>
    </div>
    """

    # ── Physics formula display ───────────────────────────────────────────────
    _formula_md = f"""
    **The physics (from @sec-data-storage):**

    ```
    Effective BW = Storage_BW × pattern_factor × worker_factor × network_overhead
                 = {_base_storage_bw:.0f} GB/s × {_pattern_factor:.2f} × {_worker_factor:.2f} × {_network_overhead:.2f}
                 = {_effective_storage_bw:.1f} GB/s

    Per-GPU BW = Effective_BW / GPU_count
               = {_effective_storage_bw:.1f} GB/s / {_GPU_COUNT}
               = {_per_gpu_bw:.4f} GB/s = {_per_gpu_bw * 1024:.1f} MB/s

    I/O time = batch_bytes / per_gpu_bw
             = {_batch_gb * 1024:.1f} MB / {_per_gpu_bw * 1024:.1f} MB/s
             = {_t_io_ms:.1f} ms

    GPU Util = t_compute / (t_compute + t_io_wait)
             = {_COMPUTE_TIME_MS:.1f} ms / ({_COMPUTE_TIME_MS:.1f} + {_t_wait_ms:.1f}) ms
             = {_gpu_util:.1f}%
    ```
    """

    mo.vstack([
        mo.Html(_cards_html),
        mo.as_html(_fig),
        mo.md(_formula_md),
    ])
    return (
        _gpu_util,
        _t_wait_ms,
        _effective_storage_bw,
        _per_gpu_bw,
        _base_storage_bw,
        _pattern_factor,
        _worker_factor,
        _prefetch_depth,
    )

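# A standalone sketch of the Act I throughput model, defined at module level so
# it can be exercised in a plain Python REPL. The constants mirror the cell
# above; the function name is new and the sketch is illustrative only, not part
# of the marimo dataflow.
def sketch_gpu_utilization(storage_bw_gbs, pattern_factor, worker_factor,
                           network_overhead=1.0, gpu_count=128,
                           batch_mb=256.0, compute_ms=50.0, prefetch_depth=1):
    """Return (gpu_util_pct, io_wait_ms) under the Act I model."""
    # Effective BW = Storage_BW × pattern_factor × worker_factor × network_overhead
    effective_bw = storage_bw_gbs * pattern_factor * worker_factor * network_overhead
    per_gpu_mbs = (effective_bw / gpu_count) * 1024.0  # each GPU's share, in MB/s
    t_io_ms = (batch_mb / per_gpu_mbs) * 1000.0 if per_gpu_mbs > 0 else float("inf")
    # Prefetch overlaps up to `prefetch_depth` compute windows with I/O.
    t_wait_ms = max(0.0, t_io_ms - compute_ms * prefetch_depth)
    gpu_util_pct = 100.0 * compute_ms / (compute_ms + t_wait_ms)
    return gpu_util_pct, t_wait_ms


# Example: 672 GB/s of Lustre with random 4K reads (factor 0.07) and one
# blocking worker (factor 0.25) lands deep in the "Starved" band:
# sketch_gpu_utilization(672.0, 0.07, 0.25)
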
# ─── ACT I: PREDICTION-VS-REALITY OVERLAY ────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction, _gpu_util):
    _pred_text = {
        "A": "Lustre cluster saturated (too little storage bandwidth)",
        "B": "Small random reads — access pattern mismatch",
        "C": "Network bandwidth between Lustre and GPU nodes saturated",
        "D": "12% GPU utilization is normal for data loading",
    }[act1_prediction.value]

    _is_correct = act1_prediction.value == "B"

    if _is_correct:
        mo.callout(mo.md(
            f"**Correct. You predicted: {_pred_text}.**\n\n"
            f"The simulation confirms it. With 1 worker and random reads, GPU utilization is "
            f"**{_gpu_util:.0f}%** — matching the reported 12%. "
            "The Lustre cluster is at roughly 1.5% of its capacity. The bottleneck is not "
            "aggregate bandwidth — it is the *access pattern*. Each training sample requires "
            "an independent file open, seek, and read. At 128 GPUs × 1 worker each, this "
            "generates 128 independent random I/O streams hitting the Lustre namespace. "
            "Each random 4K read delivers ~7% of the sequential bandwidth the hardware can provide. "
            "The fix is **not more hardware** — it is changing the access pattern to sequential "
            "reads (e.g., WebDataset tar shards) and increasing workers to 4–8 per GPU."
        ), kind="success")
    elif act1_prediction.value == "A":
        mo.callout(mo.md(
            f"**Not quite. You predicted: {_pred_text}.**\n\n"
            f"The numbers refute this immediately: 128 GPUs need only ~10.24 GB/s total, "
            "while Lustre provides 672 GB/s — a 65× margin. Saturation would require the "
            "actual demand to approach the cluster capacity. The cluster is at ~1.5% utilization. "
            "The root cause is the *read pattern*: random 4K reads from individual files deliver "
            "only ~7% of the sequential bandwidth. Increase num_workers and switch to a "
            "sequential prefetch pattern to see utilization climb. "
            "**Correct answer: B — access pattern mismatch.**"
        ), kind="warn")
    elif act1_prediction.value == "C":
        mo.callout(mo.md(
            f"**Not quite. You predicted: {_pred_text}.**\n\n"
            "InfiniBand HDR-200 provides 25 GB/s per port, and a 24-node Lustre cluster with "
            "one IB port per node has 600 GB/s of aggregate network fabric — more than enough "
            "for 10.24 GB/s of actual demand. The OST-to-compute network is not saturated. "
            "The bottleneck is within the storage access pattern itself: random small reads "
            "from many independent file handles serialize I/O that Lustre's hardware could "
            "service as fast sequential streams. "
            "**Correct answer: B — access pattern mismatch.**"
        ), kind="warn")
    else:
        mo.callout(mo.md(
            f"**Not quite. You predicted: {_pred_text}.**\n\n"
            "12% GPU utilization is definitively not normal. A properly configured data "
            "pipeline should achieve 75–90% GPU utilization during training. The simulation "
            "shows that switching to 4 workers with a prefetch pipeline raises utilization "
            "above 75% with the same storage hardware. This is a configuration failure, "
            "not a hardware limitation. "
            "**Correct answer: B — access pattern mismatch.**"
        ), kind="warn")
    return

# ─── ACT I: REFLECTION ───────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.md("""
    #### Reflection

    You have seen how access pattern dominates throughput on distributed filesystems.
    Now identify the correct architectural response.
    """)
    return


@app.cell(hide_code=True)
def _(mo):
    act1_reflection = mo.ui.radio(
        options={
            "A) Purchase faster NVMe SSDs for each Lustre OST node": "A",
            "B) Pre-shuffle and pack data into large sequential shards (WebDataset / TFDS format)": "B",
            "C) Add more DataLoader processes to increase RAM cache utilization": "C",
            "D) Replace the filesystem with a database to enable faster indexed lookups": "D",
        },
        label="What is the primary fix for small random read performance on a distributed filesystem?",
    )
    act1_reflection
    return (act1_reflection,)

@app.cell(hide_code=True)
def _(mo, act1_reflection):
    mo.stop(
        act1_reflection.value is None,
        mo.callout(mo.md("Select your answer to continue to Act II."), kind="warn"),
    )

    if act1_reflection.value == "B":
        mo.callout(mo.md(
            "**Correct.** Packing data into large sequential shards (WebDataset `.tar` archives, "
            "TensorFlow Record files, Parquet) converts random access into sequential streaming. "
            "Lustre's aggregate bandwidth — 672 GB/s — is genuinely available to sequential reads. "
            "The hardware was never the problem; the access pattern was. "
            "Sequential shards allow a single worker per GPU to sustain near-peak throughput "
            "by reading one large file contiguously rather than opening thousands of small files. "
            "This is the standard production fix for I/O-bound distributed training pipelines, "
            "documented in @sec-data-storage."
        ), kind="success")
    elif act1_reflection.value == "A":
        mo.callout(mo.md(
            "**Incorrect.** The NVMe SSDs are not the bottleneck. Each OST node already provides "
            "28 GB/s (4 × 7 GB/s NVMe) — far more than the ~80 MB/s per GPU the workload demands. "
            "Faster SSDs would give you faster hardware with the same utilization ceiling. "
            "The constraint is software: the access pattern generates random I/O that cannot "
            "benefit from higher sequential bandwidth. The correct fix is B: sequential shards."
        ), kind="warn")
    elif act1_reflection.value == "C":
        mo.callout(mo.md(
            "**Incorrect.** Adding more DataLoader processes increases the *number* of concurrent "
            "I/O requests, but each request is still a small random read from a different file. "
            "More processes multiply the IOPS demand, which can worsen metadata server load on "
            "Lustre without improving actual throughput. The root cause — fragmented random access "
            "pattern — is not addressed by adding more workers. The correct fix is B: pack data "
            "into sequential shards so each worker reads large contiguous chunks."
        ), kind="warn")
    else:
        mo.callout(mo.md(
            "**Incorrect.** A database adds indexing overhead, transactions, and write-ahead "
            "logging — all of which increase latency and reduce throughput for the purely "
            "sequential read pattern that training requires. Databases are optimized for "
            "transactional workloads with random read/write patterns. ML training requires "
            "high-throughput sequential streaming, which a well-configured filesystem (with "
            "sequential shards) handles better than any database engine. The correct fix is B."
        ), kind="warn")
    return

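# Illustrative sketch of the fix endorsed in the reflection above: stream
# samples out of one large tar shard instead of opening one file per sample.
# Standard library only; the shard path and layout are hypothetical and this
# helper is not wired into the lab UI.
import tarfile


def iter_shard_samples(shard_path):
    """Yield (name, payload_bytes) by reading a single tar shard sequentially."""
    with tarfile.open(shard_path, "r") as shard:  # one open() amortized over ~10K samples
        for member in shard:  # members come back in on-disk order: sequential reads
            payload = shard.extractfile(member)
            if payload is not None:  # skip directory entries
                yield member.name, payload.read()
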
# ─── ACT I: MATHPEEK ─────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.accordion({
        "The governing equations — I/O Throughput and Prefetch Depth": mo.md("""
    **I/O Throughput Model** (from @sec-data-storage):

    $$\\text{Effective BW} = BW_{storage} \\times f_{pattern} \\times f_{workers} \\times f_{network}$$

    Where:
    - **BW_storage** — peak sequential storage bandwidth (OSTs × 28 GB/s for Lustre)
    - **f_pattern** — read pattern efficiency: sequential ≈ 0.85, random ≈ 0.07, prefetch ≈ 0.78
    - **f_workers** — worker scaling factor (near-linear 1–4, diminishing returns beyond 8)
    - **f_network** — IB fabric utilization fraction (usually close to 1.0 if not saturated)

    **GPU Utilization:**

    $$\\eta_{GPU} = \\frac{t_{compute}}{t_{compute} + t_{IO\\ wait}}$$

    With prefetch pipelining:

    $$t_{IO\\ wait} = \\max\\left(0,\\ t_{IO} - t_{compute} \\times d_{prefetch}\\right)$$

    Where $d_{prefetch}$ is the number of batches pre-loaded asynchronously.

    **Random vs. Sequential Bandwidth Gap:**
    ```
    Sequential read (1 worker, Lustre): 672 GB/s × 0.85 = 571 GB/s aggregate
    Random 4K read (1 worker, Lustre):  672 GB/s × 0.07 = 47 GB/s aggregate
    Per-GPU share (128 GPUs): 571 / 128 = 4.46 GB/s vs 47 / 128 = 0.37 GB/s

    Batch load time (64 samples × 4 MB = 256 MB):
    Sequential: 256 MB / 4,460 MB/s ≈ 57 ms  → GPU util ≈ 87%
    Random:     256 MB / 370 MB/s   ≈ 692 ms → GPU util ≈ 7% (matches ~12% observed)
    ```

    **WebDataset Sequential Read Benchmark:**
    - Individual JPEG files, random access: ~70 MB/s per node
    - WebDataset `.tar` shards, sequential: ~2.8 GB/s per node (40× improvement)
    - Source: NVIDIA DGX documentation, @sec-data-storage
    """),
    })
    return

# ═════════════════════════════════════════════════════════════════════════════
# ACT II: DISTRIBUTED STORAGE ARCHITECTURE
# ═════════════════════════════════════════════════════════════════════════════

@app.cell(hide_code=True)
def _(mo, COLORS):
    _c = COLORS["Cloud"]
    mo.Html(f"""
    <div style="margin: 40px 0 8px 0; border-top: 2px solid #e2e8f0; padding-top: 32px;">
      <div style="font-size: 0.72rem; font-weight: 700; letter-spacing: 0.14em;
                  text-transform: uppercase; color: {_c}; margin-bottom: 6px;">
        Act II · 20–25 minutes
      </div>
      <div style="font-size: 1.6rem; font-weight: 800; color: #0f172a; line-height: 1.2;">
        Distributed Storage Architecture
      </div>
      <div style="font-size: 0.95rem; color: #475569; margin-top: 6px; max-width: 700px;">
        You have fixed the training pipeline. Now a harder problem: design the storage
        architecture for a 4,096-GPU cluster. The bandwidth math looks fine. But at this
        scale, a different bottleneck emerges — one that hardware purchases cannot solve.
      </div>
    </div>
    """)
    return

# ─── ACT II: STAKEHOLDER MESSAGE ─────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, COLORS):
    _c = COLORS["BlueLine"]
    mo.Html(f"""
    <div style="border-left: 4px solid {_c}; background: {COLORS['BlueLL']};
                border-radius: 0 10px 10px 0; padding: 16px 22px; margin: 12px 0;">
      <div style="font-size: 0.72rem; font-weight: 700; color: {_c};
                  text-transform: uppercase; letter-spacing: 0.1em; margin-bottom: 6px;">
        Incoming Message · Infrastructure CTO
      </div>
      <div style="font-style: italic; font-size: 1.0rem; color: #1e293b; line-height: 1.65;">
        "We're building a new 4,096-GPU training cluster. Current design: 48 Lustre OST
        nodes with 24 TB NVMe each, an InfiniBand fabric, compute-storage separated.
        Aggregate bandwidth: 48 × 28 GB/s = 1,344 GB/s. At 80 MB/s per GPU, we need
        only 328 GB/s — a 4× safety margin. The storage team says the design is sound.
        We need to provision enough I/O for the next 3 years of growth. What are we missing?"
      </div>
    </div>
    """)
    return

# ─── ACT II: PREDICTION LOCK ─────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.md("""
    #### Your Prediction

    *Commit to your hypothesis before using the storage architecture designer.*
    The bandwidth math is correct. The safety margin exists. Something else will fail first.
    """)
    return


@app.cell(hide_code=True)
def _(mo):
    act2_prediction = mo.ui.radio(
        options={
            "A) The Lustre cluster will hit bandwidth saturation immediately at 4096 GPUs": "A",
            "B) At 4096 GPUs with 8 workers each, the metadata server (MDS) becomes the bottleneck": "B",
            "C) Storage will always be sufficient — just add OST nodes as needed to meet demand": "C",
            "D) Compute-storage network separation causes unacceptable latency (>1 ms per read)": "D",
        },
        label="A 4096-GPU cluster with 1344 GB/s aggregate Lustre bandwidth. What fails first?",
    )
    act2_prediction
    return (act2_prediction,)


@app.cell(hide_code=True)
def _(mo, act2_prediction):
    mo.stop(
        act2_prediction.value is None,
        mo.callout(
            mo.md("Select your prediction to unlock the distributed storage designer."),
            kind="warn",
        ),
    )
    mo.callout(
        mo.md(
            f"**Prediction locked: {act2_prediction.value[:2]}** "
            "Now use the architecture designer to find the failure mode. "
            "Adjust GPU count, worker count, and MDS configuration to see where the ceiling hits."
        ),
        kind="info",
    )
    return

# ─── ACT II: INSTRUMENTS ─────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.md("""
    #### The Distributed Storage Architecture Designer

    Adjust the cluster parameters below to find the failure boundary.
    The designer models bandwidth demand, metadata operation rate, and the MDS throughput ceiling.
    """)
    return


@app.cell(hide_code=True)
def _(mo):
    act2_gpu_count = mo.ui.slider(
        start=128, stop=4096, value=512, step=128,
        label="GPU count",
        show_value=True,
    )
    act2_workers_per_gpu = mo.ui.slider(
        start=1, stop=16, value=4, step=1,
        label="Data workers per GPU",
        show_value=True,
    )
    act2_dataset_files = mo.ui.dropdown(
        options={
            "Sequential shards (1 file per 10K samples, ~10K total files)": "shards",
            "Individual files (1 file per sample, ~100M total files)": "individual",
            "Object store (no POSIX, content-addressed)": "object_store",
        },
        value="Individual files (1 file per sample, ~100M total files)",
        label="Dataset organization",
    )
    act2_mds_count = mo.ui.slider(
        start=1, stop=8, value=1, step=1,
        label="Metadata server (MDS) count",
        show_value=True,
    )
    mo.hstack([
        act2_gpu_count,
        act2_workers_per_gpu,
        act2_dataset_files,
        act2_mds_count,
    ], gap=2, justify="start")
    return (act2_gpu_count, act2_workers_per_gpu, act2_dataset_files, act2_mds_count)

@app.cell(hide_code=True)
def _(
    mo, act2_gpu_count, act2_workers_per_gpu, act2_dataset_files, act2_mds_count,
    context_toggle, go, apply_plotly_theme, COLORS, math,
):
    # ── Hardware constants ─────────────────────────────────────────────────────
    # Source: @sec-data-storage, @sec-network-fabrics
    _H100_BW_GBS_II = 3350.0   # H100 HBM bandwidth, NVIDIA spec
    _NVME_SEQ_BW_II = 7.0      # NVMe Gen4 sequential read bandwidth
    _LUSTRE_OST_BW_II = 28.0   # Per OST node: 4 × 7 GB/s NVMe
    _OST_COUNT = 48            # Fixed per the design spec
    _TOTAL_LUSTRE_BW = _OST_COUNT * _LUSTRE_OST_BW_II  # 1344 GB/s aggregate

    # ── Metadata server capacity model ────────────────────────────────────────
    # Lustre MDS handles: open, stat, close, readdir, rename, create, unlink
    # Each file access typically generates 2–4 metadata ops (open + read + close + stat)
    # Source: Lustre filesystem documentation, @sec-data-storage
    _MDS_OPS_PER_NODE = 50_000  # ops/sec per MDS node (Lustre typical, enterprise NVMe MDS)
    _METADATA_OPS_PER_FILE_ACCESS = 3.0  # open + close + stat per file read

    # ── Dataset organization factor ────────────────────────────────────────────
    # Sequential shards: one large tar file opened per worker per epoch → very few metadata ops
    # Individual files: one file open per sample → maximum metadata pressure
    # Object store: no POSIX metadata server — content-addressed, no namespace overhead
    _dataset_type = act2_dataset_files.value
    if _dataset_type == "shards":
        # Each worker opens ~1 shard file at a time, rotates through epoch
        # Metadata ops = workers × files_per_epoch ÷ samples_per_file
        _files_open_per_sec_per_worker = 0.2  # ~5 seconds per shard
        _metadata_pressure_factor = 0.02      # very low — large sequential files
    elif _dataset_type == "individual":
        # Each worker opens one file per sample at ~20ms training steps
        # Metadata ops = workers × (1 / step_time) × metadata_ops_per_file
        _files_open_per_sec_per_worker = 50.0  # 50 files/sec at ~20ms/sample
        _metadata_pressure_factor = 1.0        # maximum pressure
    else:
        # Object store: no POSIX MDS. Operations go directly to object server.
        _files_open_per_sec_per_worker = 50.0  # same access rate
        _metadata_pressure_factor = 0.0        # object store has no MDS bottleneck

    # ── Total metadata operations per second ──────────────────────────────────
    _total_workers = act2_gpu_count.value * act2_workers_per_gpu.value
    _ops_per_sec_demand = (
        _total_workers
        * _files_open_per_sec_per_worker
        * _METADATA_OPS_PER_FILE_ACCESS
        * _metadata_pressure_factor
    )

    # ── MDS capacity and saturation ────────────────────────────────────────────
    _mds_capacity = act2_mds_count.value * _MDS_OPS_PER_NODE
    _mds_saturated = (_dataset_type != "object_store") and (_ops_per_sec_demand > _mds_capacity)
    _ops_k = _ops_per_sec_demand / 1000
    _mds_capacity_k = _mds_capacity / 1000
    _mds_util_pct = min((_ops_per_sec_demand / _mds_capacity) * 100.0, 200.0) if _mds_capacity > 0 else 0.0

    # ── Bandwidth demand vs supply ─────────────────────────────────────────────
    # Actual bandwidth demand: 80 MB/s per GPU (realistic per @sec-data-storage)
    _bw_demand_gbs = act2_gpu_count.value * 0.08  # 80 MB/s = 0.08 GB/s per GPU

    # Context factor: NVMe context has different supply
    _ctx_ii = context_toggle.value
    if _ctx_ii == "nvme":
        _bw_supply_gbs = 28.0  # single node local NVMe
    else:
        _bw_supply_gbs = _TOTAL_LUSTRE_BW  # 1344 GB/s aggregate Lustre

    _bw_headroom_pct = ((_bw_supply_gbs - _bw_demand_gbs) / _bw_supply_gbs) * 100.0
    _bw_saturated = _bw_demand_gbs > _bw_supply_gbs

    # ── Needed MDS nodes to handle demand ─────────────────────────────────────
    _needed_mds = max(1, int(math.ceil(_ops_per_sec_demand / _MDS_OPS_PER_NODE))) if _dataset_type != "object_store" else 1

    # ── Architecture recommendation ────────────────────────────────────────────
    if _dataset_type == "object_store":
        _recommendation = "Object store — no MDS. Scales to any GPU count."
        _rec_color = COLORS["GreenLine"]
    elif _mds_saturated:
        _recommendation = f"Add {_needed_mds - act2_mds_count.value} more MDS nodes or migrate to object store."
        _rec_color = COLORS["RedLine"]
    elif _mds_util_pct > 60:
        _recommendation = "MDS approaching capacity. Switch to sequential shards or pre-provision additional MDS."
        _rec_color = COLORS["OrangeLine"]
    else:
        _recommendation = "Architecture is sound at current scale. Monitor MDS utilization as GPU count grows."
        _rec_color = COLORS["GreenLine"]

    # ── Chart: demand vs supply (bandwidth and metadata) ─────────────────────
    _gpu_range = list(range(128, 4097, 128))
    _bw_demand_curve = [g * 0.08 for g in _gpu_range]
    _ops_demand_curve = [
        (g * act2_workers_per_gpu.value * _files_open_per_sec_per_worker
         * _METADATA_OPS_PER_FILE_ACCESS * _metadata_pressure_factor) / 1000
        for g in _gpu_range
    ]

    _fig2 = go.Figure()

    # Bandwidth demand vs supply
    _fig2.add_trace(go.Scatter(
        x=_gpu_range, y=_bw_demand_curve,
        mode="lines", name="Bandwidth Demand (GB/s)",
        line=dict(color=COLORS["BlueLine"], width=2),
        yaxis="y1",
    ))
    _fig2.add_hline(
        y=_bw_supply_gbs, line_dash="dash", line_color=COLORS["GreenLine"],
        annotation_text=f"Storage BW Supply: {_bw_supply_gbs:.0f} GB/s",
        annotation_position="top left", yref="y1",
    )

    # Metadata ops demand vs MDS capacity (secondary y axis)
    _fig2.add_trace(go.Scatter(
        x=_gpu_range, y=_ops_demand_curve,
        mode="lines", name="Metadata Ops Demand (K ops/sec)",
        line=dict(color=COLORS["OrangeLine"], width=2, dash="dot"),
        yaxis="y2",
    ))
    _fig2.add_hline(
        y=_mds_capacity_k, line_dash="dash", line_color=COLORS["RedLine"],
        annotation_text=f"MDS Capacity: {_mds_capacity_k:.0f}K ops/sec ({act2_mds_count.value} MDS)",
        annotation_position="top right", yref="y2",
    )

    # Mark current GPU count
    _fig2.add_vline(
        x=act2_gpu_count.value, line_dash="solid", line_color="#6366f1",
        annotation_text=f"{act2_gpu_count.value} GPUs",
        annotation_position="top",
    )

    _fig2.update_layout(
        height=340,
        xaxis_title="GPU Count",
        yaxis=dict(title="Bandwidth (GB/s)", side="left", gridcolor="#f1f5f9"),
        yaxis2=dict(title="Metadata Ops (K ops/sec)", side="right", overlaying="y", gridcolor="#f1f5f9"),
        showlegend=True,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0),
        margin=dict(l=60, r=60, t=60, b=50),
        title=dict(
            text=f"Bandwidth and Metadata Demand vs. Cluster Scale ({act2_dataset_files.value[:20]}...)",
            font=dict(size=13),
        ),
    )
    apply_plotly_theme(_fig2)

    # ── Metric cards ──────────────────────────────────────────────────────────
    _mds_color = COLORS["RedLine"] if _mds_saturated else COLORS["OrangeLine"] if _mds_util_pct > 60 else COLORS["GreenLine"]
    _bw_color = COLORS["RedLine"] if _bw_saturated else COLORS["GreenLine"]

    _cards2_html = f"""
    <div style="display: flex; gap: 16px; flex-wrap: wrap; margin: 16px 0;">
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 170px; text-align: center; background: white;
                  border-top: 4px solid {_bw_color};">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          Bandwidth Demand
        </div>
        <div style="font-size: 1.9rem; font-weight: 800; color: {_bw_color}; line-height: 1;">
          {_bw_demand_gbs:.1f}
        </div>
        <div style="color: #94a3b8; font-size: 0.78rem; margin-top: 4px;">
          GB/s of {_bw_supply_gbs:.0f} GB/s supply
        </div>
      </div>
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 170px; text-align: center; background: white;
                  border-top: 4px solid {_mds_color};">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          Metadata Demand
        </div>
        <div style="font-size: 1.9rem; font-weight: 800; color: {_mds_color}; line-height: 1;">
          {_ops_k:.0f}K
        </div>
        <div style="color: #94a3b8; font-size: 0.78rem; margin-top: 4px;">
          ops/sec of {_mds_capacity_k:.0f}K capacity
        </div>
      </div>
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 170px; text-align: center; background: white;
                  border-top: 4px solid {_mds_color};">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          MDS Utilization
        </div>
        <div style="font-size: 1.9rem; font-weight: 800; color: {_mds_color}; line-height: 1;">
          {min(_mds_util_pct, 199):.0f}%
        </div>
        <div style="color: #94a3b8; font-size: 0.78rem; margin-top: 4px;">
          {act2_mds_count.value} MDS node(s)
        </div>
      </div>
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 170px; text-align: center; background: white;
                  border-top: 4px solid {_rec_color};">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          Concurrent Connections
        </div>
        <div style="font-size: 1.9rem; font-weight: 800; color: {COLORS['BlueLine']}; line-height: 1;">
          {_total_workers:,}
        </div>
        <div style="color: #94a3b8; font-size: 0.78rem; margin-top: 4px;">
          {act2_gpu_count.value} GPUs × {act2_workers_per_gpu.value} workers
        </div>
      </div>
    </div>
    """

    # ── Physics formula ────────────────────────────────────────────────────────
    _formula2_md = f"""
    **The physics (from @sec-data-storage):**

    ```
    Metadata ops/sec   = GPU_count × workers_per_gpu × files_per_sec_per_worker
                         × metadata_ops_per_file × pressure_factor
                       = {act2_gpu_count.value} × {act2_workers_per_gpu.value} × {_files_open_per_sec_per_worker:.1f}
                         × {_METADATA_OPS_PER_FILE_ACCESS:.0f} × {_metadata_pressure_factor:.2f}
                       = {_ops_per_sec_demand:,.0f} ops/sec = {_ops_k:.0f}K ops/sec

    MDS capacity       = MDS_nodes × ops_per_MDS_node
                       = {act2_mds_count.value} × {_MDS_OPS_PER_NODE:,}
                       = {_mds_capacity:,} ops/sec = {_mds_capacity_k:.0f}K ops/sec

    MDS utilization    = demand / capacity = {_ops_k:.0f}K / {_mds_capacity_k:.0f}K = {_mds_util_pct:.0f}%

    Bandwidth demand   = GPU_count × 80 MB/s = {act2_gpu_count.value} × 0.08 GB/s = {_bw_demand_gbs:.1f} GB/s
    Bandwidth headroom = {_bw_headroom_pct:.0f}% of {_bw_supply_gbs:.0f} GB/s supply
    ```
    """

    # ── Architecture recommendation panel ────────────────────────────────────
    _rec_html = f"""
    <div style="background: #f8fafc; border: 1.5px solid {_rec_color}; border-radius: 12px;
                padding: 18px 22px; margin: 12px 0;">
      <div style="font-size: 0.82rem; font-weight: 700; color: #475569; text-transform: uppercase;
                  letter-spacing: 0.08em; margin-bottom: 8px;">
        Architecture Recommendation
      </div>
      <div style="font-size: 1.0rem; font-weight: 600; color: {_rec_color}; line-height: 1.6;">
        {_recommendation}
      </div>
    </div>
    """

    mo.vstack([
        mo.Html(_cards2_html),
        mo.as_html(_fig2),
        mo.md(_formula2_md),
        mo.Html(_rec_html),
    ])
    return (
        _mds_saturated,
        _ops_k,
        _mds_capacity_k,
        _needed_mds,
        _mds_util_pct,
        _bw_demand_gbs,
        _bw_supply_gbs,
        _bw_headroom_pct,
        _dataset_type,
        _total_workers,
    )

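# Standalone sketch of the Act II metadata model: given a cluster configuration,
# how many MDS nodes would the demand require? The defaults mirror the constants
# in the cell above (50K ops/sec per MDS, 3 metadata ops per file access); the
# function name is new and the sketch is illustrative only.
def sketch_mds_nodes_needed(gpu_count, workers_per_gpu,
                            opens_per_sec_per_worker=50.0, ops_per_open=3.0,
                            mds_ops_per_node=50_000):
    """Return (demand_ops_per_sec, mds_nodes_needed) for individual-file datasets."""
    demand = gpu_count * workers_per_gpu * opens_per_sec_per_worker * ops_per_open
    nodes = max(1, -(-int(demand) // mds_ops_per_node))  # ceiling division
    return demand, nodes


# Example: the CTO's 4,096-GPU design with 8 workers per GPU:
# sketch_mds_nodes_needed(4096, 8) -> (4915200.0, 99)
# ~4,915K ops/sec of demand, needing ~99 MDS-node-equivalents against the
# single MDS the design provisions: the 98× overload the reveal describes.
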
# ─── ACT II: FAILURE STATE ────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(
    mo, _mds_saturated, _ops_k, _mds_capacity_k, _needed_mds, _mds_util_pct,
    _bw_demand_gbs, act2_mds_count, _dataset_type, _total_workers, act2_gpu_count,
):
    if _dataset_type == "object_store":
        mo.callout(mo.md(
            "**Object store bypasses the MDS bottleneck entirely.** "
            "Content-addressed object stores (S3, GCS, Azure Blob) have no POSIX namespace "
            "and no metadata server. Each object is accessed by a globally unique key — "
            "no directory traversal, no file locking, no inode table. This is the primary "
            "architectural reason object stores dominate production ML training pipelines "
            "at scale. The bandwidth remains the same, but the scaling ceiling disappears. "
            "Switch back to 'individual files' to see the bottleneck re-emerge."
        ), kind="success")
    elif _mds_saturated:
        mo.callout(mo.md(
            f"**Metadata server saturation. This cluster cannot function at this configuration.**\n\n"
            f"Demand: **{_ops_k:.0f}K ops/sec** | MDS capacity: **{_mds_capacity_k:.0f}K ops/sec** "
            f"({act2_mds_count.value} MDS node{'s' if act2_mds_count.value > 1 else ''}).\n\n"
            f"**{_total_workers:,} concurrent connections** ({act2_gpu_count.value} GPUs × "
            f"{_total_workers // act2_gpu_count.value} workers) are all generating file-open "
            "requests to the same metadata server. Lustre's MDS is a single coordination "
            "point for all namespace operations — even though the bandwidth is fine, "
            "every file access requires an MDS round-trip for the inode lookup. "
            f"To fix: add {_needed_mds - act2_mds_count.value} more MDS nodes, switch to "
            "sequential shards (reduces opens by 50×+), or migrate to object storage entirely."
        ), kind="danger")
    elif _mds_util_pct > 60:
        mo.callout(mo.md(
            f"**MDS approaching saturation: {_mds_util_pct:.0f}% utilization.** "
            f"At {act2_gpu_count.value} GPUs with individual files, metadata operations are consuming "
            "more than half of MDS capacity. Doubling the GPU count will push this into "
            "saturation before bandwidth demand becomes significant. "
            "The architectural decision: switch to sequential shards now, or provision "
            f"{_needed_mds} MDS nodes proactively. Bandwidth headroom is not the constraint to watch."
        ), kind="warn")
    else:
        mo.callout(mo.md(
            f"**Architecture is currently feasible.** "
            f"MDS utilization: {_mds_util_pct:.0f}%. Bandwidth demand: {_bw_demand_gbs:.0f} GB/s — "
            "well within supply. Push the GPU count slider to 4,096 with individual files "
            "and 8 workers to see the metadata bottleneck emerge. The bandwidth will still "
            "be fine; the MDS will not."
        ), kind="success")
    return

# ─── ACT II: PREDICTION REVEAL ───────────────────────────────────────────────
|
||
@app.cell(hide_code=True)
def _(mo, act2_prediction):
    mo.stop(
        act2_prediction.value is None,
        mo.callout(mo.md("Lock in your Act II prediction to see the reveal."), kind="warn"),
    )

    _is_correct = act2_prediction.value == "B"
    # At 4096 GPUs × 8 workers × 50 opens/sec × 3 ops/file
    # = 4,915,200 ops/sec ≈ 4,915K ops/sec,
    # vs MDS capacity of 50K ops/sec (1 MDS node).
    _actual_ops_demand_k = (4096 * 8 * 50.0 * 3.0) / 1000
    _actual_mds_cap_k = 50.0

    if _is_correct:
        _reveal = mo.callout(mo.md(
            "**Correct. You predicted: metadata server saturation.**\n\n"
            "At 4,096 GPUs × 8 workers = 32,768 concurrent connections, all generating "
            "file-open requests against the Lustre namespace: "
            f"**{_actual_ops_demand_k:,.0f}K ops/sec demand** against "
            f"**{_actual_mds_cap_k:.0f}K ops/sec MDS capacity** — a "
            f"{_actual_ops_demand_k / _actual_mds_cap_k:.0f}× overload. "
            "Meanwhile, bandwidth demand is 4,096 × 80 MB/s = 328 GB/s — well within "
            "the 1,344 GB/s supply. The bandwidth safety margin the team computed is real; "
            "the metadata capacity they never measured is the actual constraint. "
            "Every add-OST investment solves the wrong problem."
        ), kind="success")
    elif act2_prediction.value == "A":
        _reveal = mo.callout(mo.md(
            "**Not quite. You predicted: bandwidth saturation.**\n\n"
            "4,096 GPUs × 80 MB/s = 328 GB/s demand against 1,344 GB/s supply — a 4× safety margin. "
            "The bandwidth math the CTO computed is correct. But the metadata server running "
            "at 50K ops/sec capacity against 4,915K ops/sec demand (a 98× overload) fails "
            "first and catastrophically. Adding more OST nodes improves bandwidth capacity "
            "but does nothing for the MDS bottleneck. "
            "**Correct answer: B — metadata server becomes the bottleneck.**"
        ), kind="warn")
    elif act2_prediction.value == "C":
        _reveal = mo.callout(mo.md(
            "**Not quite. You predicted: storage scales linearly with OST nodes.**\n\n"
            "For *bandwidth*, this is correct — adding OSTs increases aggregate throughput. "
            "But the Lustre Metadata Server (MDS) is a separate component that manages the "
            "filesystem namespace. It does not scale with OST node count. A single MDS "
            "serves all namespace operations for the entire cluster, and its throughput "
            "ceiling is a hard constraint no OST addition can move. "
            "**Correct answer: B — metadata server becomes the bottleneck.**"
        ), kind="warn")
    else:
        _reveal = mo.callout(mo.md(
            "**Not quite. You predicted: network latency is the bottleneck.**\n\n"
            "InfiniBand HDR-200 introduces ~1–5 microseconds of fabric latency — "
            "far below the millisecond-scale batch processing window. At 25 GB/s per "
            "HDR-200 port, the network fabric is not the constraint. "
            "The Lustre metadata server, handling every file-open and close operation "
            "for 32,768 concurrent workers, saturates at 50K ops/sec against a demand "
            "of 4,915K ops/sec. Network latency is measurable but negligible; "
            "metadata throughput is the hard ceiling. "
            "**Correct answer: B — metadata server becomes the bottleneck.**"
        ), kind="warn")
    _reveal
    return
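
# Bandwidth cross-check for the reveal above (per-GPU 80 MB/s from the
# scenario; 28 GB/s per OST from the chapter constants):
#   demand ≈ 4,096 × 0.08 GB/s ≈ 328 GB/s
#   supply = 48 OSTs × 28 GB/s = 1,344 GB/s → ~4× headroom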


# ─── ACT II: REFLECTION ──────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.md("""
#### Reflection

You have seen metadata saturation emerge at scale where bandwidth was never the issue.
Now identify the architectural principle this demonstrates.
""")
    return


@app.cell(hide_code=True)
def _(mo):
    act2_reflection = mo.ui.radio(
        options={
            "A) Object stores are faster for sequential reads than POSIX filesystems": "A",
            "B) Object stores eliminate the metadata bottleneck — objects are addressed by content hash, removing directory traversal": "B",
            "C) POSIX filesystems cannot store files larger than 4 GB": "C",
            "D) Object stores support more concurrent connections per storage node than POSIX": "D",
        },
        label="Why is object storage (S3/GCS) often preferred over POSIX filesystems for training data at scale?",
    )
    act2_reflection
    return (act2_reflection,)


@app.cell(hide_code=True)
def _(mo, act2_reflection):
    mo.stop(
        act2_reflection.value is None,
        mo.callout(mo.md("Select your answer to complete Act II."), kind="warn"),
    )

    # Assign the feedback callout and emit it as the cell's last expression
    # so marimo renders it.
    if act2_reflection.value == "B":
        _fb = mo.callout(mo.md(
            "**Correct.** Object stores (S3, GCS, Azure Blob Storage) address objects by "
            "globally unique keys — no POSIX namespace, no inode table, no directory hierarchy. "
            "There is no metadata server to saturate. A GET request goes directly to the "
            "object's physical location, resolved through a distributed key-value index that "
            "scales horizontally without a coordination bottleneck. "
            "This explains why production training pipelines at Google, Meta, and Microsoft "
            "all use object storage as the primary data tier — not because sequential bandwidth "
            "is higher (it is similar), but because the namespace scaling barrier does not exist. "
            "The data gravity consequence: once training data lives in S3, compute gravitates "
            "to the same cloud region. Moving 100 TB out to a local Lustre cluster costs hours "
            "of transfer time and thousands of dollars in egress fees before training even "
            "starts (see the MathPeek below)."
        ), kind="success")
    elif act2_reflection.value == "A":
        _fb = mo.callout(mo.md(
            "**Incorrect.** Sequential read bandwidth for S3 and for a well-configured Lustre "
            "cluster is similar for large files — both can sustain hundreds of GB/s in aggregate. "
            "The advantage of object stores is not in raw sequential throughput; it is in the "
            "absence of a namespace coordination bottleneck. A POSIX filesystem routes all "
            "namespace operations through one or a few MDS nodes. An object store does not "
            "have a metadata server at all. **Correct answer: B.**"
        ), kind="warn")
    elif act2_reflection.value == "C":
        _fb = mo.callout(mo.md(
            "**Incorrect.** Modern POSIX filesystems support files far beyond 4 GB "
            "(multi-terabyte on ext4, petabyte-scale on Lustre and GPFS) — file size is "
            "no practical limit for training workloads. "
            "The limitation of POSIX at ML scale is metadata coordination overhead, not file "
            "size. **Correct answer: B — the metadata bottleneck is the architectural constraint.**"
        ), kind="warn")
    else:
        _fb = mo.callout(mo.md(
            "**Incorrect.** POSIX filesystems with appropriate tuning (Lustre with many OST "
            "nodes, GPFS with distributed metadata) can handle tens of thousands of concurrent "
            "connections. The issue is not raw concurrency capacity — it is that every connection "
            "must serialize through the MDS for namespace operations. Object stores route each "
            "request directly to the storage backend without a centralized coordinator. "
            "**Correct answer: B — content-addressed objects eliminate namespace bottlenecks.**"
        ), kind="warn")
    _fb
    return
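
# Access-path contrast behind the feedback above (illustrative summary):
#   POSIX:  open() → MDS inode lookup → read() from OST → close()   (≈3 MDS ops)
#   Object: GET(key) → distributed key-value index → storage node   (no MDS)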


# ─── ACT II: MATHPEEK ────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.accordion({
        "The governing equations — Metadata Rate and Data Gravity at Scale": mo.md("""
**Metadata Operation Rate** (from @sec-data-storage):

$$\\text{MDS demand} = N_{GPU} \\times W_{GPU} \\times f_{open} \\times m_{ops}$$

Where:

- **N_GPU** — number of GPU accelerators
- **W_GPU** — data loader workers per GPU
- **f_open** — file opens per second per worker (varies by dataset organization)
- **m_ops** — metadata operations per file access (open + stat + close ≈ 3)

**Example: 4,096 GPUs, 8 workers, individual files:**

```
MDS demand = 4,096 × 8 × 50 ops/sec × 3
           = 4,915,200 ops/sec ≈ 4,915K ops/sec
MDS supply = 1 node × 50K ops/sec = 50K ops/sec
Saturation = 4,915 / 50 = 98× overload → training stalls
```

**Sequential shards fix (same GPUs):**

```
f_open ≈ 0.2 opens/sec per worker (5 s per shard)
MDS demand = 4,096 × 8 × 0.2 × 3 = 19,661 ops/sec ≈ 20K ops/sec
MDS supply = 1 node × 50K ops/sec = 50K ops/sec
Utilization = 40% ← within bounds
```
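
A minimal Python sketch of the same arithmetic (the helper name is
illustrative, not part of the lab's instrument code):

```python
def mds_demand_kops(n_gpu, workers_per_gpu, f_open, m_ops=3.0):
    # Illustrative helper: metadata ops/sec demand, in thousands (K ops/sec)
    return n_gpu * workers_per_gpu * f_open * m_ops / 1000

mds_demand_kops(4096, 8, 50.0)   # 4915.2 → saturates a 50K ops/sec MDS
mds_demand_kops(4096, 8, 0.2)    # ≈19.7 → 40% of one MDS node
```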

**Object Store vs POSIX Comparison:**

| Property | Lustre (POSIX) | S3 / GCS (Object) |
|----------|:-------------:|:-----------------:|
| Metadata server | Required (MDS) | None |
| Namespace scalability | MDS-bound | Distributed hash |
| Sequential BW | 672 GB/s (24 OSTs) | ~same per region |
| Random access IOPS | NVMe-bound | Tiered by class |
| Training suitability | Good (with shards) | Excellent (any pattern) |

**Data Gravity Implication:**

Once training data is in S3 (us-east-1), moving 100 TB to a local Lustre cluster:

```
T_transfer = 100,000 GB / 12.5 GB/s (100 Gbps) = 8,000 s ≈ 2.2 hours
Cost       = 100,000 GB × $0.09/GB (S3 egress) = $9,000

Compare to running the training job in us-east-1 for 24 hours:
Cost       = 512 × H100 spot × $3.50/GPU-hr × 24 h = $43,008

Data gravity decision: move compute to data if transfer cost + latency
exceeds the cost of provisioning compute near the data.
```
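
The same transfer math as a minimal sketch (the function name is
illustrative; the link speed and egress price are the assumptions above):

```python
def egress_hours_and_cost(dataset_gb, link_gbs=12.5, egress_per_gb=0.09):
    # Hours to pull the dataset over the link, and the egress bill (USD)
    hours = dataset_gb / link_gbs / 3600
    return hours, dataset_gb * egress_per_gb

egress_hours_and_cost(100_000)   # → (≈2.2 hours, $9,000.0)
```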
"""),
    })
    return


# ─── DESIGN LEDGER SAVE + HUD ─────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(
    mo, ledger, context_toggle,
    act1_prediction, act1_reflection,
    act2_prediction, act2_reflection,
    act2_gpu_count, act2_workers_per_gpu,
    _gpu_util, _effective_storage_bw,
    _mds_saturated, _ops_k, _dataset_type, _total_workers,
):
    # Correctness flags for both acts; reflection flags are computed for
    # parity, but only the prediction flags feed the ledger schema below.
    _act1_correct = (act1_prediction.value == "B") if act1_prediction.value else False
    _act1_refl_correct = (act1_reflection.value == "B") if act1_reflection.value else False
    _act2_correct = (act2_prediction.value == "B") if act2_prediction.value else False
    _act2_refl_correct = (act2_reflection.value == "B") if act2_reflection.value else False

    # Map dataset type to a clean label for the ledger
    _storage_type_map = {
        "shards": "sequential_shards",
        "individual": "individual_files",
        "object_store": "object_store",
    }
    _storage_type = _storage_type_map.get(_dataset_type, "individual_files")

    ledger.save(
        chapter="v2_04",
        design={
            "context": context_toggle.value,
            "storage_type": _storage_type,
            "gpu_count": act2_gpu_count.value,
            "data_workers_per_gpu": act2_workers_per_gpu.value,
            "io_throughput_gbs": round(float(_effective_storage_bw), 2),
            "act1_prediction": act1_prediction.value if act1_prediction.value else "none",
            "act1_correct": _act1_correct,
            "act2_result": round(float(_ops_k), 1),  # metadata ops/sec demand in K
            "act2_decision": _storage_type,
            "constraint_hit": bool(_mds_saturated),
            "mds_saturated": bool(_mds_saturated),
        },
    )

    # HUD footer
    _ctx_label = "NVMe — Single Node" if context_toggle.value == "nvme" else "Distributed FS — Lustre"
    _act1_status = "correct" if _act1_correct else ("pending" if not act1_prediction.value else "incorrect")
    _act2_status = "correct" if _act2_correct else ("pending" if not act2_prediction.value else "incorrect")
    _mds_status = "saturated" if _mds_saturated else "healthy"

    def _sc(s):
        # Status → HUD accent color
        return {
            "correct": "#4ade80", "pending": "#94a3b8", "incorrect": "#f87171",
            "saturated": "#f87171", "healthy": "#4ade80",
        }.get(s, "#94a3b8")

    mo.Html(f"""
    <div style="display: flex; gap: 24px; align-items: center; flex-wrap: wrap;
                padding: 14px 24px; background: #0f172a; border-radius: 12px;
                margin-top: 40px; font-family: 'SF Mono', monospace; font-size: 0.8rem;
                border: 1px solid #1e293b;">
      <span style="color: #94a3b8; font-weight: 600; letter-spacing: 0.06em;">
        DESIGN LEDGER · V2-CH04
      </span>
      <span style="color: #475569;">|</span>
      <span>
        <span style="color: #94a3b8;">Context: </span>
        <span style="color: #e2e8f0;">{_ctx_label}</span>
      </span>
      <span>
        <span style="color: #94a3b8;">Act I prediction: </span>
        <span style="color: {_sc(_act1_status)};">{_act1_status}</span>
      </span>
      <span>
        <span style="color: #94a3b8;">Act II prediction: </span>
        <span style="color: {_sc(_act2_status)};">{_act2_status}</span>
      </span>
      <span>
        <span style="color: #94a3b8;">GPU util (Act I): </span>
        <span style="color: #e2e8f0;">{_gpu_util:.0f}%</span>
      </span>
      <span>
        <span style="color: #94a3b8;">Effective I/O BW: </span>
        <span style="color: #e2e8f0;">{_effective_storage_bw:.1f} GB/s</span>
      </span>
      <span>
        <span style="color: #94a3b8;">MDS status: </span>
        <span style="color: {_sc(_mds_status)};">{_mds_status} ({_ops_k:.0f}K ops/sec)</span>
      </span>
      <span>
        <span style="color: #94a3b8;">Workers: </span>
        <span style="color: #e2e8f0;">{_total_workers:,} ({act2_gpu_count.value}×{act2_workers_per_gpu.value})</span>
      </span>
    </div>
    """)
    return


if __name__ == "__main__":
    app.run()