import marimo

__generated_with = "0.19.6"
app = marimo.App(width="full")

# ─────────────────────────────────────────────────────────────────────────────
# LAB 04: THE DATA GRAVITY TRAP
#
# Chapter: data_engineering.qmd (@sec-data-engineering)
# Core Invariant: Data gravity — large datasets attract compute to their
#   location. Moving data costs more than moving compute when data volume
#   exceeds network bandwidth × time budget.
#
# 2-Act structure (35-40 min total):
#   Act I: The Pipeline Bottleneck (12-15 min)
#     The GPU is idle 77% of the time — but the team wants more GPUs.
#     Prediction lock → timeline instrument → reveal → reflection → MathPeek
#   Act II: The Data Gravity Calculation (20-25 min)
#     50 TB in us-east-1, training GPUs in us-west-2. Transfer or co-locate?
#     Prediction lock → gravity instruments → failure state → reflection → MathPeek
#
# Deployment contexts:
#   Cloud: Multi-region (100 Gbps inter-DC link, AWS egress $0.08/GB)
#   Edge: Local processing (1 Gbps LAN, zero egress cost)
#
# Traceability:
#   GPU utilization formula — @sec-data-engineering-feeding-problem
#   Feeding tax            — FeedingProblem class in data_engineering.qmd
#   Data gravity T = D/BW  — DataGravity class in data_engineering.qmd
#   AWS egress $0.08/GB    — DataGravity.egress_cost_per_gb_str
#   100 Gbps = 12.5 GB/s   — DataGravity.network_gbs_str
#   HDD 0.15 GB/s          — Storage tier physics, @sec-data-engineering
#   SSD 0.55 GB/s          — Storage tier physics, @sec-data-engineering
#   NVMe 3.5 GB/s          — Storage tier physics, @sec-data-engineering
#   RAM 50 GB/s            — DRAM bandwidth, @sec-data-engineering
#
# Design Ledger save:
#   chapter=4, context, storage_type_chosen, gpu_util_at_start,
#   data_gravity_triggered, act1_correct, act2_correct
# ─────────────────────────────────────────────────────────────────────────────

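# Editor's note — a worked instance of the core invariant, using this lab's own
# constants. The breakeven form D* = BW × T_budget is implied by the invariant
# above, not quoted from the chapter:
#
#   breakeven volume D* = BW × T_budget
#   cloud context: 12.5 GB/s × 6 h × 3600 s/h = 270,000 GB = 270 TB
#     → the 50 TB scenario sits below D*, so transfer *time* alone does not
#       trigger the trap; Act II shows the egress *cost* usually triggers first.
#   edge context:  0.125 GB/s × 6 h × 3600 s/h = 2,700 GB = 2.7 TB
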
# ─── CELL 0: SETUP (hide_code=False — leave visible) ─────────────────────────
@app.cell
def _():
    import marimo as mo
    import sys
    import math
    from pathlib import Path
    import plotly.graph_objects as go
    import numpy as np

    _root = Path(__file__).resolve().parents[2]
    if str(_root) not in sys.path:
        sys.path.insert(0, str(_root))

    from labs.core.state import DesignLedger
    from labs.core.style import COLORS, LAB_CSS, apply_plotly_theme

    ledger = DesignLedger()
    return mo, ledger, COLORS, LAB_CSS, apply_plotly_theme, go, np, math

# ─── CELL 1: HEADER ──────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, LAB_CSS, COLORS):
    _c = COLORS["BlueLine"]
    mo.vstack([
        LAB_CSS,
        mo.Html(f"""
        <div style="background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
                    padding: 36px 44px; border-radius: 16px; color: white;
                    box-shadow: 0 8px 32px rgba(0,0,0,0.3); margin-bottom: 8px;">
          <div style="font-size: 0.72rem; font-weight: 700; letter-spacing: 0.18em;
                      color: #475569; text-transform: uppercase; margin-bottom: 10px;">
            Machine Learning Systems · Volume I · Lab 04
          </div>
          <h1 style="margin: 0 0 10px 0; font-size: 2.2rem; font-weight: 900;
                     color: #f8fafc; line-height: 1.1; letter-spacing: -0.02em;">
            The Data Gravity Trap
          </h1>
          <p style="margin: 0 0 22px 0; font-size: 1.02rem; color: #94a3b8;
                    max-width: 660px; line-height: 1.65;">
            Your GPU is idle 77% of the time. The team wants to buy more hardware.
            Before spending another dollar, you need to diagnose whether the bottleneck
            is compute — or data movement.
          </p>
          <div style="display: flex; gap: 12px; flex-wrap: wrap; align-items: center;">
            <span style="background: rgba(99,102,241,0.15); color: #a5b4fc;
                         padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
                         font-weight: 600; border: 1px solid rgba(99,102,241,0.25);">
              Act I: Pipeline Bottleneck · 12–15 min
            </span>
            <span style="background: rgba(99,102,241,0.15); color: #a5b4fc;
                         padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
                         font-weight: 600; border: 1px solid rgba(99,102,241,0.25);">
              Act II: Data Gravity · 20–25 min
            </span>
            <span style="background: rgba(16,185,129,0.15); color: #6ee7b7;
                         padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
                         font-weight: 600; border: 1px solid rgba(16,185,129,0.25);">
              35–40 min total
            </span>
            <span class="badge badge-info">Chapter 4: Data Engineering</span>
          </div>
        </div>
        """),
    ])
    return

# ─── CELL 2: RECOMMENDED READING ─────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.callout(mo.md("""
    **Recommended Reading** — Complete the following before this lab:

    - **@sec-data-engineering-physics-data-cdcb** (Physics of Data) — Data gravity and
      the energy-movement invariant: why moving a bit costs 100–1,000x more than computing on it.
    - **@sec-data-engineering-feeding-problem** (The Feeding Problem) — The Feeding Tax,
      the GPU utilization formula, and why storage bandwidth determines training throughput.
    - **@sec-data-engineering-data-gravity-adcb** (Data Gravity) — T = D/BW, the rule of
      thumb for when to move compute vs. data, and the lakehouse architectural response.
    """), kind="info")
    return

# ─── CELL 3: CONTEXT TOGGLE ──────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    context_toggle = mo.ui.radio(
        options={
            "Cloud (multi-region, 100 Gbps inter-DC)": "cloud",
            "Edge (local processing, 1 Gbps LAN)": "edge",
        },
        value="Cloud (multi-region, 100 Gbps inter-DC)",
        label="Deployment context:",
        inline=True,
    )
    mo.vstack([
        mo.md("---"),
        mo.md("### Select Your Deployment Context"),
        mo.md(
            "This choice persists across both acts. It changes the network bandwidth "
            "and egress cost assumptions in the data gravity calculations."
        ),
        context_toggle,
    ])
    return (context_toggle,)

@app.cell(hide_code=True)
def _(mo, context_toggle, COLORS):
    _ctx = context_toggle.value
    if _ctx == "cloud":
        _bw_desc = "100 Gbps inter-region link = 12.5 GB/s"
        _cost_desc = "AWS egress: $0.08/GB"
        _color = COLORS["Cloud"]
        _label = "Cloud — Multi-Region"
    else:
        _bw_desc = "1 Gbps LAN = 0.125 GB/s"
        _cost_desc = "Local network: $0.00/GB"
        _color = COLORS["Edge"]
        _label = "Edge — Local Processing"

    mo.callout(mo.md(
        f"**Context: {_label}** — Network bandwidth: {_bw_desc}. Transfer cost: {_cost_desc}. "
        f"The data gravity calculation in Act II will use these parameters."
    ), kind="info")
    return

# ═════════════════════════════════════════════════════════════════════════════
# ACT I: THE PIPELINE BOTTLENECK
# ═════════════════════════════════════════════════════════════════════════════

@app.cell(hide_code=True)
def _(mo, COLORS):
    _c = COLORS["BlueLine"]
    mo.Html(f"""
    <div style="margin: 32px 0 8px 0;">
      <div style="font-size: 0.72rem; font-weight: 700; letter-spacing: 0.14em;
                  text-transform: uppercase; color: {_c}; margin-bottom: 6px;">
        Act I · 12–15 minutes
      </div>
      <div style="font-size: 1.6rem; font-weight: 800; color: #0f172a; line-height: 1.2;">
        The Pipeline Bottleneck
      </div>
      <div style="font-size: 0.95rem; color: #475569; margin-top: 6px; max-width: 680px;">
        A team of ML engineers is training a computer vision model. Their GPU utilization
        has been sitting at 23% for three days. Before you touch a single slider,
        you need to commit to a diagnosis.
      </div>
    </div>
    """)
    return

# ─── ACT I: STAKEHOLDER MESSAGE ──────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, COLORS):
    _c = COLORS["OrangeLine"]
    mo.Html(f"""
    <div style="border-left: 4px solid {_c}; background: {COLORS['OrangeLL']};
                border-radius: 0 10px 10px 0; padding: 16px 22px; margin: 12px 0;">
      <div style="font-size: 0.72rem; font-weight: 700; color: {_c};
                  text-transform: uppercase; letter-spacing: 0.1em; margin-bottom: 6px;">
        Incoming Message · ML Engineering Lead
      </div>
      <div style="font-style: italic; font-size: 1.0rem; color: #1e293b; line-height: 1.65;">
        "We've been training ResNet-50 on a 10 TB image dataset for 3 days.
        GPU utilization is stuck at 23%. The model is clearly too complex for
        our hardware — we need to request 4× more GPUs before Friday's deadline.
        Can you sign off on the procurement request?"
      </div>
    </div>
    """)
    return

# ─── ACT I: PREDICTION LOCK ──────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.md("""
    #### Your Prediction

    *Before interacting with the simulator, commit to your diagnosis.*

    GPU utilization is 23% during training. The team believes this means the GPU
    cannot keep up with the model's compute demands.
    """)
    return

@app.cell(hide_code=True)
def _(mo):
    act1_prediction = mo.ui.radio(
        options={
            "A) Insufficient GPU FLOPS — the model requires more compute than the GPU provides": "A",
            "B) Data loading is slower than GPU computation — the GPU is starving for input": "B",
            "C) The model is too small — it does not utilize the GPU's parallel units": "C",
            "D) The learning rate is too high — training is unstable and wasting cycles": "D",
        },
        label="GPU utilization is 23% during training. The most likely bottleneck is:",
    )
    act1_prediction
    return (act1_prediction,)

@app.cell(hide_code=True)
def _(mo, act1_prediction):
    mo.stop(
        act1_prediction.value is None,
        mo.callout(
            mo.md("Select your prediction to unlock the pipeline simulator."),
            kind="warn",
        ),
    )
    mo.callout(
        mo.md(
            f"**Prediction locked: {act1_prediction.value}.** "
            "Now explore the simulator to test your hypothesis."
        ),
        kind="info",
    )
    return

# ─── ACT I: INSTRUMENTS ──────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.md("""
    #### The Pipeline Simulator

    Adjust the parameters below to see how storage type and data loading
    configuration affect GPU utilization. The timeline shows where time goes
    within a single training batch.
    """)
    return

@app.cell(hide_code=True)
def _(mo):
    act1_dataset_size_tb = mo.ui.slider(
        start=1, stop=100, value=10, step=1,
        label="Dataset size (TB)",
        show_value=True,
    )
    act1_storage_type = mo.ui.dropdown(
        options={
            "HDD (0.15 GB/s)": "hdd",
            "SSD (0.55 GB/s)": "ssd",
            "NVMe (3.5 GB/s)": "nvme",
            "RAM disk (50 GB/s)": "ram",
        },
        value="HDD (0.15 GB/s)",
        label="Storage type",
    )
    act1_num_workers = mo.ui.slider(
        start=1, stop=32, value=4, step=1,
        label="DataLoader workers",
        show_value=True,
    )
    mo.hstack([
        act1_dataset_size_tb,
        act1_storage_type,
        act1_num_workers,
    ], gap=2, justify="start")
    return (act1_dataset_size_tb, act1_storage_type, act1_num_workers)

@app.cell(hide_code=True)
def _(mo, act1_storage_type, act1_num_workers, act1_dataset_size_tb, go, apply_plotly_theme, COLORS):
    # ── Physics engine ────────────────────────────────────────────────────────
    # Source: @sec-data-engineering-feeding-problem
    # GPU compute time per batch (ResNet-50, batch=64, A100)
    # ResNet-50: 4.1 GFLOPs per image, A100: 312 TFLOPS FP32
    # t_compute = (64 * 4.1e9) / (312e12) ≈ 0.00084 s ≈ 0.84 ms
    _BATCH_SIZE = 64
    _RESNET50_GFLOPS_PER_IMG = 4.1  # GFLOPs
    _A100_TFLOPS_FP32 = 312.0       # TFLOPS (A100 FP32 tensor, NVIDIA spec)
    _IMG_SIZE_MB = (224 * 224 * 3 * 4) / (1024 * 1024)  # 224x224 RGB FP32

    _t_compute_s = (_BATCH_SIZE * _RESNET50_GFLOPS_PER_IMG * 1e9) / (_A100_TFLOPS_FP32 * 1e12)

    # Storage bandwidth (GB/s) — source: @sec-data-engineering storage tier data
    _storage_bw = {
        "hdd": 0.15,  # HDD sequential read, GB/s
        "ssd": 0.55,  # SATA SSD, GB/s
        "nvme": 3.5,  # NVMe PCIe 4.0, GB/s
        "ram": 50.0,  # DRAM, GB/s
    }
    # Unprefixed names below are shared with later cells; marimo treats
    # underscore-prefixed variables as cell-local, so they cannot be returned.
    bw = _storage_bw[act1_storage_type.value]

    # Effective bandwidth scales with num_workers (diminishing returns after ~8)
    _worker_factor = min(act1_num_workers.value, 8) / 4.0
    effective_bw = bw * min(_worker_factor, 2.0)

    # I/O time per batch: bytes to load / effective_bandwidth
    _batch_bytes_gb = _BATCH_SIZE * _IMG_SIZE_MB / 1024
    _t_io_s = _batch_bytes_gb / effective_bw

    # Preprocessing: fixed 0.2× of IO time (decode, augment)
    _t_preprocess_s = _t_io_s * 0.2

    # Total batch time and GPU utilization
    # GPU util = t_compute / (t_compute + max(t_io - t_compute, 0) + t_preprocess)
    # When IO > compute, GPU waits. When compute > IO, pipeline overlaps.
    _t_wait = max(_t_io_s - _t_compute_s, 0.0)
    _t_total = _t_compute_s + _t_wait + _t_preprocess_s
    gpu_util = min(_t_compute_s / _t_total, 1.0) * 100.0

    # Color coding for utilization
    if gpu_util >= 80:
        _util_color = COLORS["GreenLine"]
        _util_label = "Healthy"
    elif gpu_util >= 50:
        _util_color = COLORS["OrangeLine"]
        _util_label = "Degraded"
    else:
        _util_color = COLORS["RedLine"]
        _util_label = "Starved"

    # Scale to ms for display
    t_compute_ms = _t_compute_s * 1000
    t_io_ms = _t_io_s * 1000
    _t_preprocess_ms = _t_preprocess_s * 1000
    _t_total_ms = _t_total * 1000

    # ── Timeline bar chart ────────────────────────────────────────────────────
    _fig = go.Figure()

    _fig.add_trace(go.Bar(
        name="GPU Compute",
        x=[t_compute_ms],
        y=["Batch Timeline"],
        orientation="h",
        marker_color=COLORS["GreenLine"],
        text=[f"GPU Compute<br>{t_compute_ms:.2f} ms"],
        textposition="inside",
        insidetextanchor="middle",
    ))
    _fig.add_trace(go.Bar(
        name="Data Loading (I/O)",
        x=[max(t_io_ms - t_compute_ms, 0)],
        y=["Batch Timeline"],
        orientation="h",
        marker_color=COLORS["RedLine"],
        text=[f"I/O Wait<br>{max(t_io_ms - t_compute_ms, 0):.2f} ms"],
        textposition="inside",
        insidetextanchor="middle",
    ))
    _fig.add_trace(go.Bar(
        name="Preprocessing",
        x=[_t_preprocess_ms],
        y=["Batch Timeline"],
        orientation="h",
        marker_color=COLORS["OrangeLine"],
        text=[f"Preprocess<br>{_t_preprocess_ms:.2f} ms"],
        textposition="inside",
        insidetextanchor="middle",
    ))

    _fig.update_layout(
        barmode="stack",
        height=160,
        xaxis_title="Time per batch (ms)",
        showlegend=True,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0),
        margin=dict(l=10, r=10, t=40, b=40),
        title=dict(text=f"Batch Timeline — Total: {_t_total_ms:.2f} ms", font=dict(size=13)),
    )
    apply_plotly_theme(_fig)

    # ── Metric cards ──────────────────────────────────────────────────────────
    _cards_html = f"""
    <div style="display: flex; gap: 16px; flex-wrap: wrap; margin: 16px 0;">
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 160px; text-align: center; background: white;
                  border-top: 4px solid {_util_color};">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          GPU Utilization
        </div>
        <div style="font-size: 2.2rem; font-weight: 800; color: {_util_color}; line-height: 1;">
          {gpu_util:.0f}%
        </div>
        <div style="color: {_util_color}; font-size: 0.78rem; font-weight: 700; margin-top: 4px;">
          {_util_label}
        </div>
      </div>
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 160px; text-align: center; background: white;">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          GPU Compute
        </div>
        <div style="font-size: 2.2rem; font-weight: 800; color: {COLORS['GreenLine']}; line-height: 1;">
          {t_compute_ms:.2f} ms
        </div>
        <div style="color: #94a3b8; font-size: 0.78rem; margin-top: 4px;">per batch</div>
      </div>
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 160px; text-align: center; background: white;">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          I/O Wait
        </div>
        <div style="font-size: 2.2rem; font-weight: 800; color: {_util_color}; line-height: 1;">
          {t_io_ms:.2f} ms
        </div>
        <div style="color: #94a3b8; font-size: 0.78rem; margin-top: 4px;">per batch</div>
      </div>
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 160px; text-align: center; background: white;">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          Storage BW
        </div>
        <div style="font-size: 2.2rem; font-weight: 800; color: {COLORS['BlueLine']}; line-height: 1;">
          {effective_bw:.2f} GB/s
        </div>
        <div style="color: #94a3b8; font-size: 0.78rem; margin-top: 4px;">
          {act1_num_workers.value} workers
        </div>
      </div>
    </div>
    """

    # ── Physics formula display ───────────────────────────────────────────────
    _formula_md = f"""
    **The physics (from @sec-data-engineering-feeding-problem):**

    ```
    GPU Utilization = t_compute / (t_compute + t_io_wait + t_preprocess)
                    = {t_compute_ms:.2f} ms / ({t_compute_ms:.2f} + {max(t_io_ms - t_compute_ms, 0):.2f} + {_t_preprocess_ms:.2f}) ms
                    = {gpu_util:.1f}%

    Effective BW = Storage_BW × worker_factor
                 = {bw:.2f} GB/s × {min(act1_num_workers.value, 8) / 4.0:.2f}
                 = {effective_bw:.2f} GB/s

    I/O Time = batch_bytes / effective_BW
             = {_batch_bytes_gb * 1024:.1f} MB / {effective_bw * 1024:.0f} MB/s
             = {t_io_ms:.2f} ms
    ```
    """

    mo.vstack([
        mo.Html(_cards_html),
        mo.as_html(_fig),
        mo.md(_formula_md),
    ])
    return gpu_util, t_compute_ms, t_io_ms, effective_bw, bw

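# Editor's note — what the engine above reports at the lab's defaults (HDD,
# 4 workers, batch 64), worked from the constants in the cell so the narrative
# numbers can be checked against the model:
#
#   batch bytes ≈ 64 × 0.574 MB ≈ 36.8 MB  (raw FP32 tensors)
#   t_io ≈ 0.0359 GB / 0.15 GB/s ≈ 239 ms   vs   t_compute ≈ 0.84 ms
#   → utilization rounds to 0% — a raw-tensor worst case, even starker than
#     the team's observed 23%; the diagnosis (I/O-bound) is the same.
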
# ─── ACT I: REVEAL ───────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction, gpu_util):
    # Prediction-vs-reality overlay
    _pred_text = {
        "A": "GPU FLOPS insufficient",
        "B": "Data loading bottleneck (I/O bound)",
        "C": "Model too small",
        "D": "Learning rate too high",
    }[act1_prediction.value]

    _is_correct = act1_prediction.value == "B"
    _actual_gpu_util = gpu_util

    if _is_correct:
        mo.callout(mo.md(
            f"**Correct. You predicted: {_pred_text}.**\n\n"
            f"The GPU is sitting at **{_actual_gpu_util:.0f}% utilization** not because "
            "it lacks FLOPS, but because it finishes each batch computation in "
            "~0.84 ms while the HDD needs far longer to deliver the next 64 images. "
            "The GPU is I/O-bound: most of its wall-clock time is spent waiting "
            "for the data pipeline — exactly the pattern behind the team's reported 23%. "
            "Adding more GPUs would make the problem worse — each additional GPU "
            "would compete for the same storage bandwidth."
        ), kind="success")
    elif act1_prediction.value == "A":
        mo.callout(mo.md(
            f"**Not quite. You predicted: {_pred_text}.**\n\n"
            f"The GPU is at **{_actual_gpu_util:.0f}% utilization** but this is not "
            "because it lacks FLOPS. An A100 can process a ResNet-50 batch in ~0.84 ms. "
            "The bottleneck is that an HDD delivers only 0.15 GB/s — loading the same "
            "batch takes far longer than computing on it. The GPU completes its work, "
            "then waits. The 'fix' of adding GPUs would only increase I/O contention. "
            "**Correct answer: B — the pipeline is I/O-bound.**"
        ), kind="warn")
    elif act1_prediction.value == "C":
        mo.callout(mo.md(
            f"**Not quite. You predicted: {_pred_text}.**\n\n"
            f"The GPU is at **{_actual_gpu_util:.0f}% utilization** because it is waiting "
            "for data, not because the model is too simple. ResNet-50 requires 4.1 GFLOPs "
            "per image — this is not a trivial model. Even a 50-layer ResNet finishes its "
            "batch in ~0.84 ms on an A100, which then sits idle waiting for the HDD. "
            "Model complexity is irrelevant when the bottleneck is I/O. "
            "**Correct answer: B — the pipeline is I/O-bound.**"
        ), kind="warn")
    else:
        mo.callout(mo.md(
            f"**Not quite. You predicted: {_pred_text}.**\n\n"
            "Learning rate affects convergence quality, not hardware utilization. "
            f"The GPU is at **{_actual_gpu_util:.0f}% utilization** because it finishes "
            "computing in ~0.84 ms and then idles while the storage system loads the "
            "next batch. This is a physical bottleneck in the data pipeline — it has "
            "nothing to do with the optimization algorithm. "
            "**Correct answer: B — the pipeline is I/O-bound.**"
        ), kind="warn")
    return

# ─── ACT I: REFLECTION ───────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.md("""
    #### Reflection

    You have seen how storage bandwidth sets GPU utilization: the HDD leaves the
    GPU starved, and each faster tier recovers more of it.
    Now commit to the correct engineering response.
    """)
    return

@app.cell(hide_code=True)
def _(mo):
    act1_reflection = mo.ui.radio(
        options={
            "A) Add more GPUs to the training cluster": "A",
            "B) Increase batch size from 64 to 512 to reduce I/O frequency": "B",
            "C) Switch to faster storage or increase num_workers to saturate the pipeline": "C",
            "D) Reduce model size to lower compute time per batch": "D",
        },
        label="What is the correct fix for 23% GPU utilization caused by data loading?",
    )
    act1_reflection
    return (act1_reflection,)

@app.cell(hide_code=True)
def _(mo, act1_reflection):
    mo.stop(
        act1_reflection.value is None,
        mo.callout(mo.md("Select your answer to continue."), kind="warn"),
    )

    if act1_reflection.value == "C":
        mo.callout(mo.md(
            "**Correct.** Faster storage (NVMe: 3.5 GB/s vs HDD: 0.15 GB/s) eliminates "
            "I/O wait directly. More DataLoader workers parallelize reads across multiple "
            "CPU cores, increasing effective bandwidth. Both approaches attack the same "
            "root cause: insufficient I/O throughput relative to GPU compute speed. "
            "Adding GPUs (A) worsens the I/O competition. Larger batches (B) reduce "
            "the *number* of I/O operations but each operation loads more data, so total "
            "I/O time increases proportionally. Smaller models (D) reduce compute time, "
            "which actually *lowers* utilization further by making the GPU finish even faster."
        ), kind="success")
    elif act1_reflection.value == "A":
        mo.callout(mo.md(
            "**Incorrect.** Adding GPUs distributes the compute load — but the storage "
            "bottleneck is shared. Each new GPU would compete for the same HDD bandwidth "
            "(0.15 GB/s), reducing the effective bandwidth per GPU. You would spend more "
            "money and achieve lower per-GPU utilization. The correct fix is C: faster "
            "storage or more DataLoader workers."
        ), kind="warn")
    elif act1_reflection.value == "B":
        mo.callout(mo.md(
            "**Partially helpful, but not the root fix.** Larger batches reduce the "
            "*number* of I/O calls per epoch, but each call loads 8× more data (512 vs 64 "
            "images). Total I/O bytes per epoch is unchanged, so the Feeding Tax "
            "remains proportionally similar. The root cause — storage bandwidth below "
            "what the GPU needs — requires faster storage (C). Larger batches also "
            "affect gradient statistics and may require learning rate adjustments."
        ), kind="warn")
    else:
        mo.callout(mo.md(
            "**Incorrect.** A smaller model finishes each batch computation even faster, "
            "making the GPU idle for even longer while waiting for I/O. This worsens "
            "the utilization metric. The problem is that storage is too slow, not that "
            "the model is too slow. The correct fix is C: faster storage or more workers."
        ), kind="warn")
    return

# ─── ACT I: MATHPEEK ─────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.accordion({
        "The governing equation — GPU Utilization and the Feeding Tax": mo.md("""
    **GPU Utilization** (from @sec-data-engineering-feeding-problem):

    $$\\eta_{GPU} = \\frac{t_{compute}}{t_{compute} + t_{IO\\ wait} + t_{preprocess}}$$

    Where:
    - **t_compute** — time the GPU is executing the forward/backward pass
    - **t_IO wait** — time the GPU idles waiting for the next batch from storage
    - **t_preprocess** — time for CPU-side decode and augmentation

    **The Feeding Tax** is the complement: `Feeding Tax = (1 - η_GPU) × 100%`

    When storage bandwidth (BW_storage) is less than the GPU's required bandwidth
    (BW_required = batch_bytes × GPU_throughput), the pipeline stalls:

    $$t_{IO\\ wait} = \\max\\left(\\frac{batch\\ bytes}{BW_{storage}} - t_{compute},\\ 0\\right)$$

    (positive when storage is the bottleneck, zero when the GPU is.)

    **Numerical example (batch 64, raw FP32 images, HDD, 4 workers):**
    ```
    t_compute    = (64 × 4.1 GFLOPs) / 312 TFLOPS ≈ 0.84 ms
    t_IO         = (64 × ~600 KB) / 150 MB/s ≈ 256 ms
    t_IO_wait    = 256 − 0.84 ≈ 255 ms   (GPU is idle)
    t_preprocess = 256 × 0.2 ≈ 51 ms
    η_GPU        = 0.84 / (0.84 + 255 + 51) < 1%
    ```

    Raw FP32 tensors are the worst case: compressed on-disk formats and
    page-cache hits raise the effective bandwidth, which is how a real pipeline
    lands nearer the team's observed 23%. Either way the diagnosis is the same —
    the pipeline is storage-bound.

    **Pipeline overlap** occurs when t_IO < t_compute: prefetching can hide I/O
    latency and η_GPU → 100%. Under this model only RAM-class bandwidth reaches
    that point for ResNet-50 (36.8 MB / 50 GB/s ≈ 0.73 ms < 0.84 ms); NVMe
    closes most of the gap once images are stored compressed.
    """),
    })
    return

# ═════════════════════════════════════════════════════════════════════════════
# ACT II: THE DATA GRAVITY CALCULATION
# ═════════════════════════════════════════════════════════════════════════════

@app.cell(hide_code=True)
def _(mo, COLORS):
    _c = COLORS["Cloud"]
    mo.Html(f"""
    <div style="margin: 40px 0 8px 0; border-top: 2px solid #e2e8f0; padding-top: 32px;">
      <div style="font-size: 0.72rem; font-weight: 700; letter-spacing: 0.14em;
                  text-transform: uppercase; color: {_c}; margin-bottom: 6px;">
        Act II · 20–25 minutes
      </div>
      <div style="font-size: 1.6rem; font-weight: 800; color: #0f172a; line-height: 1.2;">
        The Data Gravity Calculation
      </div>
      <div style="font-size: 0.95rem; color: #475569; margin-top: 6px; max-width: 700px;">
        You have fixed the training pipeline. Now a new problem: the team's 50 TB
        training dataset lives in AWS us-east-1. The GPUs they need are available
        in us-west-2. Should they move the data, or move the compute?
      </div>
    </div>
    """)
    return

# ─── ACT II: STAKEHOLDER MESSAGE ─────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, COLORS):
    _c = COLORS["BlueLine"]
    mo.Html(f"""
    <div style="border-left: 4px solid {_c}; background: {COLORS['BlueLL']};
                border-radius: 0 10px 10px 0; padding: 16px 22px; margin: 12px 0;">
      <div style="font-size: 0.72rem; font-weight: 700; color: {_c};
                  text-transform: uppercase; letter-spacing: 0.1em; margin-bottom: 6px;">
        Incoming Message · Cloud Infrastructure Lead
      </div>
      <div style="font-style: italic; font-size: 1.0rem; color: #1e293b; line-height: 1.65;">
        "We have 50 TB of training data in us-east-1 and a cluster of 8 A100s
        available in us-west-2. We have a 100 Gbps inter-region link.
        The training run takes about 6 hours. Should we transfer the data
        to us-west-2 first, or spin up compute in us-east-1?"
      </div>
    </div>
    """)
    return

# ─── ACT II: PREDICTION LOCK ─────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.md("""
    #### Your Prediction

    *Commit to an estimate before using the calculator.*

    You have a 50 TB dataset in AWS us-east-1. Training GPUs are in us-west-2.
    The inter-region link runs at 100 Gbps. How long does the data transfer take?
    """)
    return

@app.cell(hide_code=True)
def _(mo):
    act2_prediction = mo.ui.radio(
        options={
            "A) About 1 minute — 100 Gbps is very fast": "A",
            "B) About 67 minutes — 100 Gbps = 12.5 GB/s, 50 TB ÷ 12.5 GB/s ≈ 67 min": "B",
            "C) About 11 hours — the practical throughput is much lower than the rated speed": "C",
            "D) About 4.6 days — transfer overhead and routing make 100 Gbps unusable": "D",
        },
        label="Transfer time for 50 TB over a 100 Gbps inter-region link:",
    )
    act2_prediction
    return (act2_prediction,)

@app.cell(hide_code=True)
def _(mo, act2_prediction):
    mo.stop(
        act2_prediction.value is None,
        mo.callout(mo.md("Select your transfer time estimate to unlock the data gravity calculator."), kind="warn"),
    )
    mo.callout(
        mo.md(
            f"**Prediction locked: {act2_prediction.value}.** "
            "Now use the calculator to determine whether your estimate was correct — "
            "and more importantly, whether transferring is the right decision at all."
        ),
        kind="info",
    )
    return

# ─── ACT II: INSTRUMENTS ─────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.md("""
    #### The Data Gravity Calculator

    Adjust the dataset size and network parameters to see when data transfer
    exceeds your training time budget — and when it becomes cheaper to move
    the compute instead of the data.
    """)
    return

@app.cell(hide_code=True)
def _(mo):
    act2_dataset_gb = mo.ui.slider(
        start=1, stop=6,
        value=4,  # log10(50 TB in GB) = log10(51200) ≈ 4.7; use 4 as default (10 TB)
        step=1,
        label="Dataset size (log10 GB) — 10¹ = 10 GB, 10⁶ = 1 PB",
        show_value=True,
    )
    act2_network_bw = mo.ui.dropdown(
        options={
            "1 Gbps Ethernet (0.125 GB/s)": "1g",
            "10 Gbps Ethernet (1.25 GB/s)": "10g",
            "100 Gbps Ethernet (12.5 GB/s)": "100g",
        },
        value="100 Gbps Ethernet (12.5 GB/s)",
        label="Network bandwidth",
    )
    act2_training_hours = mo.ui.slider(
        start=1, stop=72, value=6, step=1,
        label="Training time budget (hours)",
        show_value=True,
    )
    mo.hstack([
        act2_dataset_gb,
        act2_network_bw,
        act2_training_hours,
    ], gap=2, justify="start")
    return (act2_dataset_gb, act2_network_bw, act2_training_hours)

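# Editor's note — the slider is log-scaled; at the cloud default of 12.5 GB/s
# the raw T = D/BW transfer times work out to (derived from this lab's
# constants, decimal units):
#
#   10^1 = 10 GB  → 0.8 s           10^4 = 10 TB  → 800 s ≈ 13 min
#   10^2 = 100 GB → 8 s             10^5 = 100 TB → 8,000 s ≈ 2.2 h
#   10^3 = 1 TB   → 80 s            10^6 = 1 PB   → 80,000 s ≈ 22 h
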
@app.cell(hide_code=True)
def _(mo, act2_dataset_gb, act2_network_bw, act2_training_hours, context_toggle, go, apply_plotly_theme, COLORS):
    # ── Physics engine ────────────────────────────────────────────────────────
    # Source: DataGravity class in data_engineering.qmd
    # T_transfer = D_vol / BW (from @sec-data-engineering-data-gravity-adcb)
    # AWS egress: $0.08/GB (DataGravity.egress_cost_per_gb_str)

    _dataset_gb_val = 10 ** act2_dataset_gb.value

    # Network bandwidth (GB/s) by tier
    _net_bw_map = {
        "1g": 0.125,   # 1 Gbps = 0.125 GB/s
        "10g": 1.25,   # 10 Gbps = 1.25 GB/s
        "100g": 12.5,  # 100 Gbps = 12.5 GB/s
    }
    _net_bw_gbs = _net_bw_map[act2_network_bw.value]
    _net_label = {"1g": "1 Gbps", "10g": "10 Gbps", "100g": "100 Gbps"}[act2_network_bw.value]

    # Context-dependent cost: cloud has egress, edge has none
    _ctx = context_toggle.value
    if _ctx == "cloud":
        _egress_cost_per_gb = 0.08  # AWS egress, DataGravity.egress_cost_per_gb_str
    else:
        _egress_cost_per_gb = 0.00  # local LAN, no egress fee

    # Transfer calculations (unprefixed names are shared with later cells;
    # marimo keeps underscore-prefixed variables local to this cell)
    _transfer_seconds = _dataset_gb_val / _net_bw_gbs
    transfer_hours = _transfer_seconds / 3600
    transfer_cost = _dataset_gb_val * _egress_cost_per_gb

    # Training budget
    training_hours = act2_training_hours.value

    # Failure state: transfer > training budget
    transfer_exceeds_training = transfer_hours > training_hours

    # Decision metric: compare transfer cost to compute-in-place cost
    # Approximate: spinning up equivalent compute in the source region
    # A100 spot price ~$2.50/GPU-hr, 8 GPUs
    _compute_spot_cost_per_hour = 20.0  # $20/hr for 8× A100 spot
    compute_cost_to_stay = _compute_spot_cost_per_hour * training_hours

    # Format dataset size for display
    if _dataset_gb_val >= 1e6:
        ds_label = f"{_dataset_gb_val/1e6:.1f} PB"
    elif _dataset_gb_val >= 1e3:
        ds_label = f"{_dataset_gb_val/1e3:.1f} TB"
    else:
        ds_label = f"{_dataset_gb_val:.0f} GB"

    # Format transfer time for display
    if transfer_hours >= 24:
        time_label = f"{transfer_hours/24:.1f} days"
    elif transfer_hours >= 1:
        time_label = f"{transfer_hours:.1f} hours"
    else:
        time_label = f"{transfer_hours * 60:.0f} minutes"

    # ── Bar chart: transfer time vs training budget ───────────────────────────
    transfer_color = COLORS["RedLine"] if transfer_exceeds_training else COLORS["GreenLine"]
    _train_color = COLORS["BlueLine"]

    _fig2 = go.Figure()
    _fig2.add_trace(go.Bar(
        name="Data Transfer",
        x=["Time Comparison (hours)"],
        y=[transfer_hours],
        marker_color=transfer_color,
        text=[f"{transfer_hours:.1f}h"],
        textposition="outside",
        width=0.3,
    ))
    _fig2.add_trace(go.Bar(
        name="Training Budget",
        x=["Time Comparison (hours)"],
        y=[training_hours],
        marker_color=_train_color,
        text=[f"{training_hours}h budget"],
        textposition="outside",
        width=0.3,
    ))
    _fig2.update_layout(
        barmode="group",
        height=280,
        yaxis_title="Hours",
        showlegend=True,
        margin=dict(l=40, r=20, t=40, b=40),
        title=dict(text=f"Transfer vs Training: {ds_label} over {_net_label}", font=dict(size=13)),
    )
    apply_plotly_theme(_fig2)

    # ── Metric cards ──────────────────────────────────────────────────────────
    _cost_color = COLORS["RedLine"] if transfer_cost > compute_cost_to_stay else COLORS["GreenLine"]

    _cards2_html = f"""
    <div style="display: flex; gap: 16px; flex-wrap: wrap; margin: 16px 0;">
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 155px; text-align: center; background: white;
                  border-top: 4px solid {transfer_color};">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          Transfer Time
        </div>
        <div style="font-size: 1.9rem; font-weight: 800; color: {transfer_color}; line-height: 1;">
          {time_label}
        </div>
        <div style="color: #94a3b8; font-size: 0.78rem; margin-top: 4px;">
          {ds_label} at {_net_bw_gbs} GB/s
        </div>
      </div>
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 155px; text-align: center; background: white;
                  border-top: 4px solid {_cost_color};">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          Egress Cost
        </div>
        <div style="font-size: 1.9rem; font-weight: 800; color: {_cost_color}; line-height: 1;">
          ${transfer_cost:,.0f}
        </div>
        <div style="color: #94a3b8; font-size: 0.78rem; margin-top: 4px;">
          @ ${_egress_cost_per_gb:.2f}/GB
        </div>
      </div>
      <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 12px;
                  min-width: 155px; text-align: center; background: white;
                  border-top: 4px solid {COLORS['BlueLine']};">
        <div style="color: #94a3b8; font-size: 0.82rem; font-weight: 600; margin-bottom: 6px;">
          Compute-in-Place
        </div>
        <div style="font-size: 1.9rem; font-weight: 800; color: {COLORS['BlueLine']}; line-height: 1;">
          ${compute_cost_to_stay:,.0f}
        </div>
        <div style="color: #94a3b8; font-size: 0.78rem; margin-top: 4px;">
          8× A100 spot × {training_hours}h
        </div>
      </div>
    </div>
    """

    # ── Physics formula display ───────────────────────────────────────────────
    _formula2_md = f"""
    **The physics (from @sec-data-engineering-data-gravity-adcb):**

    ```
    T_transfer = D_vol / BW
               = {_dataset_gb_val:,.0f} GB / {_net_bw_gbs} GB/s
               = {_transfer_seconds:,.0f} s
               = {time_label}

    Egress cost = D_vol × ${_egress_cost_per_gb:.2f}/GB
                = {_dataset_gb_val:,.0f} × ${_egress_cost_per_gb:.2f}
                = ${transfer_cost:,.0f}
    ```
    """

    mo.vstack([
        mo.Html(_cards2_html),
        mo.as_html(_fig2),
        mo.md(_formula2_md),
    ])
    return (
        transfer_exceeds_training,
        transfer_hours,
        training_hours,
        transfer_cost,
        compute_cost_to_stay,
        time_label,
        ds_label,
        transfer_color,
    )

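# Editor's note — the calculator at its defaults (10 TB, 100 Gbps, cloud
# context, 6 h budget), worked from the constants in the cell above:
#
#   T_transfer = 10,000 GB / 12.5 GB/s = 800 s ≈ 13 minutes  (well under budget)
#   Egress     = 10,000 GB × $0.08     = $800
#   Stay put   = $20/h × 6 h           = $120
#   → time says "move the data"; cost already favors moving the compute.
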
# ─── ACT II: FAILURE STATE ───────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, transfer_exceeds_training, transfer_hours, training_hours, time_label, ds_label):
    if transfer_exceeds_training:
        mo.callout(mo.md(
            f"**Data transfer ({time_label}) exceeds the training budget ({training_hours} hours). "
            f"Moving {ds_label} over the network costs more time than the training run itself.**\n\n"
            "This is the **data gravity trap**: at this scale, compute must move to the data. "
            "Options: (1) spin up training in the same region as the data, "
            "(2) use a Data Lakehouse — run the training job directly on the storage node, "
            "or (3) upgrade to a faster network link. "
            "Pull the dataset size or training budget slider to find the breakeven point."
        ), kind="danger")
    else:
        _ratio = transfer_hours / training_hours
        if _ratio > 0.5:
            mo.callout(mo.md(
                f"**Transfer feasible but costly: {time_label} is {_ratio*100:.0f}% of your "
                f"{training_hours}-hour training budget.**\n\n"
                "Data is approaching the gravity threshold. A small increase in dataset size "
                "or reduction in training time will trigger the trap. Consider co-locating "
                "compute with data as a proactive architectural decision."
            ), kind="warn")
        else:
            mo.callout(mo.md(
                f"**Transfer is viable: {time_label} is well within the {training_hours}-hour budget.**\n\n"
                "At this scale, data transfer is not the bottleneck. Data gravity has not yet "
                "trapped this workload. Increase the dataset size to find where the physics "
                "forces the architectural switch from 'move data' to 'move compute.'"
            ), kind="success")
    return

# ─── ACT II: PREDICTION REVEAL ───────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act2_prediction):
    # 50 TB at 100 Gbps: 50,000 GB / 12.5 GB/s = 4,000 s = 66.7 min ≈ 67 min
    _actual_minutes = 67
    _actual_cost = 50_000 * 0.08  # 50 TB × $0.08/GB = $4,000

    _predicted = {
        "A": 1,
        "B": 67,
        "C": 660,
        "D": 6624,
    }[act2_prediction.value]

    _ratio = _actual_minutes / _predicted if _predicted > 0 else float("inf")
    _is_correct = act2_prediction.value == "B"

    if _is_correct:
        mo.callout(mo.md(
            f"**Correct. You predicted ~{_predicted} minutes. The actual transfer time is ~{_actual_minutes} minutes.**\n\n"
            f"50 TB ÷ 12.5 GB/s = 4,000 s = **{_actual_minutes} minutes**. "
            f"Plus the egress cost: 50,000 GB × $0.08 = **${_actual_cost:,.0f}**. "
            "The transfer is feasible for a 6-hour training run — but $4,000 in egress "
            "may exceed the cost of spinning up equivalent compute in us-east-1."
        ), kind="success")
    elif act2_prediction.value == "A":
        mo.callout(mo.md(
            f"**You were off by {_ratio:.0f}×. You predicted ~{_predicted} minute. "
            f"The actual transfer time is ~{_actual_minutes} minutes.**\n\n"
            "100 Gbps sounds fast, but it equals only 12.5 GB/s. Dividing 50 TB "
            "(= 50,000 GB) by 12.5 GB/s gives 4,000 seconds = **67 minutes**. "
            "Sustained 100 Gbps is rare in practice; real transfers are slower."
        ), kind="warn")
    elif act2_prediction.value == "C":
        mo.callout(mo.md(
            f"**You were off by {1/_ratio:.1f}×. You predicted ~{_predicted} minutes. "
            f"The actual transfer time is ~{_actual_minutes} minutes.**\n\n"
            "The calculation: 50 TB ÷ 12.5 GB/s = 4,000 s = **67 minutes**. "
            "A sustained 100 Gbps connection is fast enough to transfer 50 TB in "
            "just over an hour. The bottleneck becomes cost ($4,000 egress), not time."
        ), kind="warn")
    else:
        mo.callout(mo.md(
            f"**You were off by {1/_ratio:.1f}×. You predicted ~{_predicted/60:.1f} hours. "
            f"The actual transfer time is ~{_actual_minutes} minutes.**\n\n"
            "At 100 Gbps = 12.5 GB/s: 50 TB ÷ 12.5 GB/s = 4,000 s = **67 minutes**. "
            "The 100 Gbps link is genuinely fast. Data gravity at this scale is "
            "primarily an *economic* problem (egress cost) rather than a time problem."
        ), kind="warn")
    return

# ─── ACT II: DECISION COMPARISON ─────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, transfer_cost, compute_cost_to_stay, time_label, ds_label):
    _decision = "Move compute to data" if transfer_cost > compute_cost_to_stay else "Move data to compute"
    _decision_color = "#CB202D" if transfer_cost > compute_cost_to_stay else "#008F45"

    mo.Html(f"""
    <div style="background: #f8fafc; border: 1.5px solid #e2e8f0; border-radius: 12px;
                padding: 20px 24px; margin: 16px 0;">
      <div style="font-size: 0.82rem; font-weight: 700; color: #475569; text-transform: uppercase;
                  letter-spacing: 0.08em; margin-bottom: 12px;">
        Architecture Decision
      </div>
      <div style="display: flex; gap: 24px; flex-wrap: wrap; align-items: center;">
        <div style="flex: 1; min-width: 200px;">
          <div style="font-size: 0.88rem; color: #64748b; margin-bottom: 4px;">
            Transfer data ({ds_label}) to compute
          </div>
          <div style="font-size: 1.3rem; font-weight: 700; color: #0f172a;">
            {time_label} + ${transfer_cost:,.0f} egress
          </div>
        </div>
        <div style="font-size: 1.4rem; color: #94a3b8; font-weight: 300;">vs</div>
        <div style="flex: 1; min-width: 200px;">
          <div style="font-size: 0.88rem; color: #64748b; margin-bottom: 4px;">
            Spin up compute where data lives
          </div>
          <div style="font-size: 1.3rem; font-weight: 700; color: #0f172a;">
            $0 transfer + ${compute_cost_to_stay:,.0f} compute
          </div>
        </div>
        <div style="background: {_decision_color}; color: white; padding: 10px 20px;
                    border-radius: 8px; font-weight: 700; font-size: 0.95rem; white-space: nowrap;">
          {_decision}
        </div>
      </div>
    </div>
    """)
    return

# ─── ACT II: REFLECTION ──────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.md("""
    #### Reflection

    You have seen data gravity in action. Now identify the principle it demonstrates.
    """)
    return

@app.cell(hide_code=True)
def _(mo):
    act2_reflection = mo.ui.radio(
        options={
            "A) Large datasets are heavy to store and require expensive hardware": "A",
            "B) Compute naturally migrates toward large datasets because transfer cost exceeds compute cost": "B",
            "C) Data should always be compressed before training to reduce transfer time": "C",
            "D) Cloud infrastructure is always faster than edge for data-intensive workloads": "D",
        },
        label="Data gravity means:",
    )
    act2_reflection
    return (act2_reflection,)

@app.cell(hide_code=True)
def _(mo, act2_reflection):
    mo.stop(
        act2_reflection.value is None,
        mo.callout(mo.md("Select your answer to continue."), kind="warn"),
    )

    if act2_reflection.value == "B":
        mo.callout(mo.md(
            "**Correct.** Data gravity (from @sec-data-engineering-data-gravity-adcb) is "
            "the economic and physical pressure that pushes compute toward large datasets. "
            "When T_transfer = D_vol / BW exceeds the training time budget, or when egress "
            "cost exceeds the cost of running compute in the data's region, it becomes "
            "cheaper to bring the compute to the data. This explains the architecture of "
            "Data Lakehouses — processing engines (Spark, Presto, training jobs) run "
            "directly on the storage nodes where the data already resides."
        ), kind="success")
    elif act2_reflection.value == "A":
        mo.callout(mo.md(
            "**Incorrect.** Data gravity is not about storage weight or hardware cost. "
            "It is about *movement cost*: the time and money required to transfer data "
            "across a network. A 1 PB dataset sitting in one region is not a gravity "
            "problem — the gravity problem begins when you try to move it somewhere else. "
            "The correct answer is B: compute migrates toward data when transfer cost "
            "exceeds the cost of co-locating compute."
        ), kind="warn")
    elif act2_reflection.value == "C":
        mo.callout(mo.md(
            "**Incorrect.** Compression reduces the bytes to transfer, which can reduce "
            "transfer time (T = D_vol / BW — smaller D_vol, shorter T). But data gravity "
            "is not the observation that compression helps. It is the observation that "
            "beyond a certain scale, no amount of compression makes transfer viable — "
            "compute must move to the data instead. The correct answer is B."
        ), kind="warn")
    else:
        mo.callout(mo.md(
            "**Incorrect.** Cloud infrastructure is not universally faster for data workloads. "
            "The key insight of data gravity is that *location relative to the data* determines "
            "which infrastructure is faster. An edge device processing data locally avoids all "
            "egress costs and network latency entirely. Cloud is faster only when the compute "
            "is co-located with the data (same region). The correct answer is B."
        ), kind="warn")
    return

# ─── ACT II: MATHPEEK ────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    mo.accordion({
        "The governing equation — Data Gravity": mo.md("""
    **Data Gravity** (from @sec-data-engineering-data-gravity-adcb):

    $$T_{transfer} = \\frac{D_{vol}}{BW}$$

    Where:
    - **D_vol** — dataset volume in GB
    - **BW** — network bandwidth in GB/s
    - **T_transfer** — total transfer time in seconds

    **The architectural decision rule** (from the DataGravity notebook):

    > *If T_transfer > T_training → move compute to data*
    > *If T_transfer < T_training → move data to compute*

    **The economic rule** includes egress cost:

    $$Cost_{transfer} = D_{vol} \\times \\$0.08/\\text{GB}$$

    At petabyte scale: $10^6 \\text{ GB} \\times \\$0.08 = \\$80{,}000$ egress alone.

    **Numerical example (50 TB at 100 Gbps):**
    ```
    T_transfer = 50,000 GB / 12.5 GB/s = 4,000 s ≈ 67 minutes
    Cost       = 50,000 GB × $0.08 = $4,000

    Rule: if training takes < 67 min, the transfer takes longer than the job —
    move compute to data.
    ```

    **The rule of thumb** (from the DataGravity notebook in data_engineering.qmd):
    - *Petabyte scale:* Code moves to Data (Data Lakehouse, in-place compute)
    - *Gigabyte scale:* Data moves to Code (standard transfer is viable)
    """),
    })
    return

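# Editor's note — combining the time rule and the economic rule above at this
# lab's cloud constants ($0.08/GB egress, $20/h for 8× A100 spot; derived here,
# not quoted from the chapter):
#
#   egress beats compute-in-place only while D × $0.08 < $20/h × T_train
#   → D < 250 GB per training hour; a 6 h run breaks even at ~1.5 TB.
#   Long before transfer *time* exceeds the budget (270 TB at 12.5 GB/s × 6 h),
#   the egress *bill* already argues for moving the compute.
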
# ─── DESIGN LEDGER SAVE + HUD ─────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, ledger, context_toggle, act1_prediction, act1_reflection,
      act2_prediction, act2_reflection, gpu_util, transfer_exceeds_training,
      act1_storage_type, COLORS):
    # Save chapter results to Design Ledger
    _act1_correct = act1_prediction.value == "B" if act1_prediction.value else False
    _act1_refl_correct = act1_reflection.value == "C" if act1_reflection.value else False
    _act2_correct = act2_prediction.value == "B" if act2_prediction.value else False
    _act2_refl_correct = act2_reflection.value == "B" if act2_reflection.value else False

    ledger.save(
        chapter=4,
        design={
            "context": context_toggle.value,
            "storage_type_chosen": act1_storage_type.value if act1_storage_type.value else "hdd",
            "gpu_util_at_start": round(gpu_util, 1),
            "data_gravity_triggered": bool(transfer_exceeds_training),
            "act1_correct": _act1_correct,
            "act1_reflection_correct": _act1_refl_correct,
            "act2_correct": _act2_correct,
            "act2_reflection_correct": _act2_refl_correct,
        },
    )

    # HUD footer
    _ctx_label = "Cloud — Multi-Region" if context_toggle.value == "cloud" else "Edge — Local"
    _act1_status = "correct" if _act1_correct else ("pending" if act1_prediction.value is None else "incorrect")
    _act2_status = "correct" if _act2_correct else ("pending" if act2_prediction.value is None else "incorrect")
    _gravity_status = "triggered" if transfer_exceeds_training else "not triggered"

    def _status_color(s):
        return {"correct": "#4ade80", "pending": "#94a3b8", "incorrect": "#f87171", "triggered": "#f87171", "not triggered": "#4ade80"}.get(s, "#94a3b8")

    mo.Html(f"""
    <div style="display: flex; gap: 24px; align-items: center; flex-wrap: wrap;
                padding: 14px 24px; background: #0f172a; border-radius: 12px;
                margin-top: 40px; font-family: 'SF Mono', monospace; font-size: 0.8rem;
                border: 1px solid #1e293b;">
      <span style="color: #94a3b8; font-weight: 600; letter-spacing: 0.06em;">
        DESIGN LEDGER · CH04
      </span>
      <span style="color: #475569;">|</span>
      <span>
        <span style="color: #94a3b8;">Context: </span>
        <span style="color: #e2e8f0;">{_ctx_label}</span>
      </span>
      <span>
        <span style="color: #94a3b8;">Act I prediction: </span>
        <span style="color: {_status_color(_act1_status)};">{_act1_status}</span>
      </span>
      <span>
        <span style="color: #94a3b8;">Act II prediction: </span>
        <span style="color: {_status_color(_act2_status)};">{_act2_status}</span>
      </span>
      <span>
        <span style="color: #94a3b8;">Gravity trap: </span>
        <span style="color: {_status_color(_gravity_status)};">{_gravity_status}</span>
      </span>
      <span>
        <span style="color: #94a3b8;">GPU util (initial): </span>
        <span style="color: #e2e8f0;">{gpu_util:.0f}%</span>
      </span>
    </div>
    """)
    return

if __name__ == "__main__":
    app.run()