import marimo

# Version of marimo this notebook was generated with.
__generated_with = "0.19.6"
# Full-width layout so the side-by-side instrument panels have room.
app = marimo.App(width="full")
# ─────────────────────────────────────────────────────────────────────────────
# LAB 04: THE DATA GRAVITY TRAP
#
# Chapter: data_engineering.qmd (@sec-data-engineering)
# Core Invariant: Data gravity — large datasets attract compute to their
# location. Moving data costs more than moving compute when data volume
# exceeds network bandwidth × time budget.
#
# 2-Act structure (35-40 min total):
# Act I: The Pipeline Bottleneck (12-15 min)
# The GPU is idle 77% of the time — but the team wants more GPUs.
# Prediction lock → timeline instrument → reveal → reflection → MathPeek
# Act II: The Data Gravity Calculation (20-25 min)
# 50 TB in us-east-1, training GPUs in us-west-2. Transfer or co-locate?
# Prediction lock → gravity instruments → failure state → reflection → MathPeek
#
# Deployment contexts:
# Cloud: Multi-region (100 Gbps inter-DC link, AWS egress $0.08/GB)
# Edge: Local processing (1 Gbps LAN, zero egress cost)
#
# Traceability:
# GPU utilization formula — @sec-data-engineering-feeding-problem
# Feeding tax — FeedingProblem class in data_engineering.qmd
# Data gravity T = D/BW — DataGravity class in data_engineering.qmd
# AWS egress $0.08/GB — DataGravity.egress_cost_per_gb_str
# 100 Gbps = 12.5 GB/s — DataGravity.network_gbs_str
# HDD 0.15 GB/s — Storage tier physics, @sec-data-engineering
# SSD 0.55 GB/s — Storage tier physics, @sec-data-engineering
# NVMe 3.5 GB/s — Storage tier physics, @sec-data-engineering
# RAM 50 GB/s — DRAM bandwidth, @sec-data-engineering
#
# Design Ledger save:
# chapter=4, context, storage_type_chosen, gpu_util_at_start,
# data_gravity_triggered, act1_correct, act2_correct
# ─────────────────────────────────────────────────────────────────────────────
# ─── CELL 0: SETUP (hide_code=False — leave visible) ─────────────────────────
@app.cell
def _():
    # Notebook, plotting, and numeric dependencies.
    import marimo as mo
    import sys
    import math
    from pathlib import Path
    import plotly.graph_objects as go
    import numpy as np

    # Make the repository root importable so `labs.core` resolves when the
    # notebook runs from its own directory (file lives two levels deep).
    _root = Path(__file__).resolve().parents[2]
    if str(_root) not in sys.path:
        sys.path.insert(0, str(_root))
    from labs.core.state import DesignLedger
    from labs.core.style import COLORS, LAB_CSS, apply_plotly_theme

    # Shared ledger used to persist lab results (see header traceability notes).
    ledger = DesignLedger()
    return mo, ledger, COLORS, LAB_CSS, apply_plotly_theme, go, np, math
# ─── CELL 1: HEADER ──────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, LAB_CSS, COLORS):
    # Lab title banner.
    # NOTE(review): the f-string body below is plain text; presumably the
    # original carried HTML markup using the `_c` accent color — confirm
    # rendering against the upstream notebook.
    _c = COLORS["BlueLine"]
    mo.vstack([
        LAB_CSS,
        mo.Html(f"""
Machine Learning Systems · Volume I · Lab 04
The Data Gravity Trap
Your GPU is idle 77% of the time. The team wants to buy more hardware.
Before spending another dollar, you need to diagnose whether the bottleneck
is compute — or data movement.
Act I: Pipeline Bottleneck · 12–15 min
Act II: Data Gravity · 20–25 min
35–40 min total
Chapter 4: Data Engineering
"""),
    ])
    return
# ─── CELL 2: RECOMMENDED READING ─────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Pre-lab reading list pointing at the relevant textbook sections.
    _reading = mo.md("""
**Recommended Reading** — Complete the following before this lab:
- **@sec-data-engineering-physics-data-cdcb** (Physics of Data) — Data gravity and
the energy-movement invariant: why moving a bit costs 100–1,000x more than computing on it.
- **@sec-data-engineering-feeding-problem** (The Feeding Problem) — The Feeding Tax,
GPU utilization formula, and why storage bandwidth determines training throughput.
- **@sec-data-engineering-data-gravity-adcb** (Data Gravity) — T = D/BW, the rule of
thumb for when to move compute vs. data, and the lakehouse architectural response.
""")
    mo.callout(_reading, kind="info")
    return
# ─── CELL 3: CONTEXT TOGGLE ──────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Deployment-context selector; its value feeds both acts' calculations.
    _context_options = {
        "Cloud (multi-region, 100 Gbps inter-DC)": "cloud",
        "Edge (local processing, 1 Gbps LAN)": "edge",
    }
    context_toggle = mo.ui.radio(
        options=_context_options,
        value="Cloud (multi-region, 100 Gbps inter-DC)",
        label="Deployment context:",
        inline=True,
    )
    _intro = mo.md(
        "This choice persists across both acts. It changes the network bandwidth "
        "and egress cost assumptions in the data gravity calculations."
    )
    mo.vstack([
        mo.md("---"),
        mo.md("### Select Your Deployment Context"),
        _intro,
        context_toggle,
    ])
    return (context_toggle,)
@app.cell(hide_code=True)
def _(mo, context_toggle, COLORS):
    # Restate the physics parameters implied by the selected context.
    # (bandwidth description, cost description, accent color, display label)
    _context_params = {
        "cloud": (
            "100 Gbps inter-region link = 12.5 GB/s",
            "AWS egress: $0.08/GB",
            COLORS["Cloud"],
            "Cloud — Multi-Region",
        ),
        "edge": (
            "1 Gbps LAN = 0.125 GB/s",
            "Local network: $0.00/GB",
            COLORS["Edge"],
            "Edge — Local Processing",
        ),
    }
    _bw_desc, _cost_desc, _color, _label = _context_params[context_toggle.value]
    _banner = mo.md(
        f"**Context: {_label}** — Network bandwidth: {_bw_desc}. Transfer cost: {_cost_desc}. "
        f"The data gravity calculation in Act II will use these parameters."
    )
    mo.callout(_banner, kind="info")
    return
# ═════════════════════════════════════════════════════════════════════════════
# ACT I: THE PIPELINE BOTTLENECK
# ═════════════════════════════════════════════════════════════════════════════
@app.cell(hide_code=True)
def _(mo, COLORS):
    # Act I banner.
    # NOTE(review): f-string body is plain text; presumably the original
    # carried HTML markup using the `_c` accent — confirm rendering.
    _c = COLORS["BlueLine"]
    mo.Html(f"""
Act I · 12–15 minutes
The Pipeline Bottleneck
A team of ML engineers is training a computer vision model. Their GPU utilization
has been sitting at 23% for three days. Before you touch a single slider,
you need to commit to a diagnosis.
""")
    return
# ─── ACT I: STAKEHOLDER MESSAGE ──────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, COLORS):
    # Act I stakeholder message card.
    # NOTE(review): f-string body is plain text; presumably the original
    # carried HTML markup using the `_c` accent — confirm rendering.
    _c = COLORS["OrangeLine"]
    mo.Html(f"""
Incoming Message · ML Engineering Lead
"We've been training ResNet-50 on a 10 TB image dataset for 3 days.
GPU utilization is stuck at 23%. The model is clearly too complex for
our hardware — we need to request 4× more GPUs before Friday's deadline.
Can you sign off on the procurement request?"
""")
    return
# ─── ACT I: PREDICTION LOCK ──────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act I prediction prompt shown before the simulator unlocks.
    _prompt = """
#### Your Prediction
*Before interacting with the simulator, commit to your diagnosis.*
GPU utilization is 23% during training. The team believes this means the GPU
cannot keep up with the model's compute demands.
"""
    mo.md(_prompt)
    return
@app.cell(hide_code=True)
def _(mo):
    # Act I prediction lock: four candidate diagnoses keyed by letter.
    _choices = {
        "A) Insufficient GPU FLOPS — the model requires more compute than the GPU provides": "A",
        "B) Data loading is slower than GPU computation — the GPU is starving for input": "B",
        "C) The model is too small — it does not utilize the GPU's parallel units": "C",
        "D) The learning rate is too high — training is unstable and wasting cycles": "D",
    }
    act1_prediction = mo.ui.radio(
        options=_choices,
        label="GPU utilization is 23% during training. The most likely bottleneck is:",
    )
    act1_prediction
    return (act1_prediction,)
@app.cell(hide_code=True)
def _(mo, act1_prediction):
    # Gate: halt this cell's output until a prediction is selected.
    mo.stop(
        act1_prediction.value is None,
        mo.callout(
            mo.md("Select your prediction to unlock the pipeline simulator."),
            kind="warn",
        ),
    )
    # value is a single letter; [:2] is a harmless defensive slice.
    _locked = (
        f"**Prediction locked: {act1_prediction.value[:2]}** "
        "Now explore the simulator to test your hypothesis."
    )
    mo.callout(mo.md(_locked), kind="info")
    return
# ─── ACT I: INSTRUMENTS ──────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Introduce the Act I pipeline simulator instruments.
    _intro = """
#### The Pipeline Simulator
Adjust the parameters below to see how storage type and data loading
configuration affect GPU utilization. The timeline shows where time goes
within a single training batch.
"""
    mo.md(_intro)
    return
@app.cell(hide_code=True)
def _(mo):
    # Act I instrument panel: dataset size, storage tier, and worker count.
    _storage_options = {
        "HDD (0.15 GB/s)": "hdd",
        "SSD (0.55 GB/s)": "ssd",
        "NVMe (3.5 GB/s)": "nvme",
        "RAM disk (50 GB/s)": "ram",
    }
    act1_dataset_size_tb = mo.ui.slider(
        start=1,
        stop=100,
        value=10,
        step=1,
        label="Dataset size (TB)",
        show_value=True,
    )
    act1_storage_type = mo.ui.dropdown(
        options=_storage_options,
        value="HDD (0.15 GB/s)",
        label="Storage type",
    )
    act1_num_workers = mo.ui.slider(
        start=1,
        stop=32,
        value=4,
        step=1,
        label="DataLoader workers",
        show_value=True,
    )
    mo.hstack(
        [act1_dataset_size_tb, act1_storage_type, act1_num_workers],
        gap=2,
        justify="start",
    )
    return (act1_dataset_size_tb, act1_storage_type, act1_num_workers)
@app.cell(hide_code=True)
def _(mo, act1_storage_type, act1_num_workers, act1_dataset_size_tb, go, apply_plotly_theme, COLORS):
    # Act I physics engine: per-batch compute vs. I/O timing, GPU utilization,
    # a stacked timeline chart, metric cards, and the formula walkthrough.
    # Source: @sec-data-engineering-feeding-problem
    # GPU compute time per batch (ResNet-50, batch=64, A100)
    # ResNet-50: 4.1 GFLOPs per image, A100: 312 TFLOPS FP32
    # t_compute = (64 * 4.1e9) / (312e12) ≈ 0.00084 s ≈ 0.84 ms
    # NOTE(review): act1_dataset_size_tb is accepted for reactivity but is not
    # used in the per-batch physics below — confirm that is intentional.
    _BATCH_SIZE = 64
    _RESNET50_GFLOPS_PER_IMG = 4.1   # GFLOPs
    _A100_TFLOPS_FP32 = 312.0        # TFLOPS (A100 FP32 tensor, NVIDIA spec)
    _IMG_SIZE_MB = (224 * 224 * 3 * 4) / (1024 * 1024)  # 224x224 RGB FP32
    _t_compute_s = (_BATCH_SIZE * _RESNET50_GFLOPS_PER_IMG * 1e9) / (_A100_TFLOPS_FP32 * 1e12)

    # Storage bandwidth (GB/s) — source: @sec-data-engineering storage tier data
    _storage_bw = {
        "hdd": 0.15,   # HDD sequential read, GB/s
        "ssd": 0.55,   # SATA SSD, GB/s
        "nvme": 3.5,   # NVMe PCIe 4.0, GB/s
        "ram": 50.0,   # DRAM, GB/s
    }
    _bw = _storage_bw[act1_storage_type.value]

    # Effective bandwidth scales with num_workers (diminishing returns after ~8).
    # min(workers, 8) / 4 already tops out at 2.0; the second min is a
    # belt-and-braces clamp.
    _worker_factor = min(act1_num_workers.value, 8) / 4.0
    _effective_bw = _bw * min(_worker_factor, 2.0)

    # I/O time per batch: bytes to load / effective_bandwidth
    _batch_bytes_gb = _BATCH_SIZE * _IMG_SIZE_MB / 1024
    _t_io_s = _batch_bytes_gb / _effective_bw
    # Preprocessing: fixed 0.2× of IO time (decode, augment)
    _t_preprocess_s = _t_io_s * 0.2

    # Total batch time and GPU utilization
    # GPU util = t_compute / (t_compute + max(t_io - t_compute, 0) + t_preprocess)
    # When IO > compute, GPU waits. When compute > IO, pipeline overlaps.
    _t_wait = max(_t_io_s - _t_compute_s, 0.0)
    _t_total = _t_compute_s + _t_wait + _t_preprocess_s
    _gpu_util = min(_t_compute_s / _t_total, 1.0) * 100.0

    # Traffic-light color coding for the utilization card.
    # NOTE(review): _util_color is unused in the visible code; presumably it
    # was referenced by stripped HTML markup in _cards_html — confirm.
    if _gpu_util >= 80:
        _util_color = COLORS["GreenLine"]
        _util_label = "Healthy"
    elif _gpu_util >= 50:
        _util_color = COLORS["OrangeLine"]
        _util_label = "Degraded"
    else:
        _util_color = COLORS["RedLine"]
        _util_label = "Starved"

    # Scale to ms for display
    _t_compute_ms = _t_compute_s * 1000
    _t_io_ms = _t_io_s * 1000
    _t_preprocess_ms = _t_preprocess_s * 1000
    _t_total_ms = _t_total * 1000

    # ── Timeline bar chart ────────────────────────────────────────────────────
    # FIX: the per-segment labels were broken multi-line single-quoted
    # f-strings (a syntax error); Plotly uses "<br>" for line breaks in text.
    _fig = go.Figure()
    _fig.add_trace(go.Bar(
        name="GPU Compute",
        x=[_t_compute_ms],
        y=["Batch Timeline"],
        orientation="h",
        marker_color=COLORS["GreenLine"],
        text=[f"GPU Compute<br>{_t_compute_ms:.2f} ms"],
        textposition="inside",
        insidetextanchor="middle",
    ))
    _fig.add_trace(go.Bar(
        name="Data Loading (I/O)",
        x=[max(_t_io_ms - _t_compute_ms, 0)],
        y=["Batch Timeline"],
        orientation="h",
        marker_color=COLORS["RedLine"],
        text=[f"I/O Wait<br>{max(_t_io_ms - _t_compute_ms, 0):.2f} ms"],
        textposition="inside",
        insidetextanchor="middle",
    ))
    _fig.add_trace(go.Bar(
        name="Preprocessing",
        x=[_t_preprocess_ms],
        y=["Batch Timeline"],
        orientation="h",
        marker_color=COLORS["OrangeLine"],
        text=[f"Preprocess<br>{_t_preprocess_ms:.2f} ms"],
        textposition="inside",
        insidetextanchor="middle",
    ))
    _fig.update_layout(
        barmode="stack",
        height=160,
        xaxis_title="Time per batch (ms)",
        showlegend=True,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0),
        margin=dict(l=10, r=10, t=40, b=40),
        title=dict(text=f"Batch Timeline — Total: {_t_total_ms:.2f} ms", font=dict(size=13)),
    )
    apply_plotly_theme(_fig)

    # ── Metric cards ──────────────────────────────────────────────────────────
    # NOTE(review): plain-text card body; presumably the original carried HTML
    # markup (and used _util_color) — confirm rendering.
    _cards_html = f"""
GPU Utilization
{_gpu_util:.0f}%
{_util_label}
GPU Compute
{_t_compute_ms:.2f} ms
per batch
I/O Wait
{_t_io_ms:.2f} ms
per batch
Storage BW
{_effective_bw:.2f} GB/s
{act1_num_workers.value} workers
"""
    # ── Physics formula display ───────────────────────────────────────────────
    _formula_md = f"""
**The physics (from @sec-data-engineering-feeding-problem):**
```
GPU Utilization = t_compute / (t_compute + t_io_wait + t_preprocess)
= {_t_compute_ms:.2f} ms / ({_t_compute_ms:.2f} + {max(_t_io_ms - _t_compute_ms, 0):.2f} + {_t_preprocess_ms:.2f}) ms
= {_gpu_util:.1f}%
Effective BW = Storage_BW × worker_factor
= {_bw:.2f} GB/s × {min(act1_num_workers.value, 8) / 4.0:.2f}
= {_effective_bw:.2f} GB/s
I/O Time = batch_bytes / effective_BW
= {_batch_bytes_gb * 1024:.1f} MB / {_effective_bw * 1024:.0f} MB/s
= {_t_io_ms:.2f} ms
```
"""
    mo.vstack([
        mo.Html(_cards_html),
        mo.as_html(_fig),
        mo.md(_formula_md),
    ])
    # NOTE(review): marimo treats underscore-prefixed names as cell-local, so
    # returning them for use in other cells may not work as intended — confirm
    # against the marimo version pinned in __generated_with.
    return (
        _gpu_util,
        _t_compute_ms,
        _t_io_ms,
        _effective_bw,
        _bw,
    )
# ─── ACT I: REVEAL ───────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction, _gpu_util):
    # Act I prediction-vs-reality overlay (correct answer: B, I/O-bound).
    # FIX: before a prediction is locked, act1_prediction.value is None and
    # the dict lookup below raised KeyError — the mo.stop in the lock cell
    # only halts that cell, not this one. Guard explicitly.
    mo.stop(act1_prediction.value is None)
    _pred_text = {
        "A": "GPU FLOPS insufficient",
        "B": "Data loading bottleneck (I/O bound)",
        "C": "Model too small",
        "D": "Learning rate too high",
    }[act1_prediction.value]
    _is_correct = act1_prediction.value == "B"
    _actual_gpu_util = _gpu_util  # live value from the simulator cell
    if _is_correct:
        mo.callout(mo.md(
            f"**Correct. You predicted: {_pred_text}.**\n\n"
            f"The GPU is sitting at **{_actual_gpu_util:.0f}% utilization** not because "
            "it lacks FLOPS, but because it finishes each batch computation in "
            "~0.84 ms while the HDD requires ~3.6 ms to load the next batch. "
            "The GPU is I/O-bound: it spends 77% of its wall-clock time waiting "
            "for the data pipeline to deliver the next 64 images. "
            "Adding more GPUs would make the problem worse — each additional GPU "
            "would compete for the same storage bandwidth."
        ), kind="success")
    elif act1_prediction.value == "A":
        mo.callout(mo.md(
            f"**Not quite. You predicted: {_pred_text}.**\n\n"
            f"The GPU is at **{_actual_gpu_util:.0f}% utilization** but this is not "
            "because it lacks FLOPS. An A100 can process a ResNet-50 batch in ~0.84 ms. "
            "The bottleneck is that an HDD delivers only 0.15 GB/s — loading the same "
            "batch takes ~3.6 ms. The GPU completes its work, then waits. "
            "The 'fix' of adding GPUs would only increase I/O contention. "
            "**Correct answer: B — the pipeline is I/O-bound.**"
        ), kind="warn")
    elif act1_prediction.value == "C":
        mo.callout(mo.md(
            f"**Not quite. You predicted: {_pred_text}.**\n\n"
            f"The GPU is at **{_actual_gpu_util:.0f}% utilization** because it is waiting "
            "for data, not because the model is too simple. ResNet-50 requires 4.1 GFLOPs "
            "per image — this is not a trivial model. Even a 50-layer ResNet finishes its "
            "batch in 0.84 ms on an A100, which is then idle for 2.76 ms waiting for the "
            "HDD. Model complexity is irrelevant when the bottleneck is I/O. "
            "**Correct answer: B — the pipeline is I/O-bound.**"
        ), kind="warn")
    else:
        mo.callout(mo.md(
            f"**Not quite. You predicted: {_pred_text}.**\n\n"
            f"Learning rate affects convergence quality, not hardware utilization. "
            f"The GPU is at **{_actual_gpu_util:.0f}% utilization** because it finishes "
            "computing in 0.84 ms and then idles for 2.76 ms waiting for the storage "
            "system to load the next batch. This is a physical bottleneck in the "
            "data pipeline — it has nothing to do with the optimization algorithm. "
            "**Correct answer: B — the pipeline is I/O-bound.**"
        ), kind="warn")
    return
# ─── ACT I: REFLECTION ───────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act I reflection prompt.
    _prompt = """
#### Reflection
You have seen that HDD storage produces 23% GPU utilization while NVMe reaches 78%.
Now commit to the correct engineering response.
"""
    mo.md(_prompt)
    return
@app.cell(hide_code=True)
def _(mo):
    # Act I reflection question: the correct engineering response (C).
    _choices = {
        "A) Add more GPUs to the training cluster": "A",
        "B) Increase batch size from 64 to 512 to reduce I/O frequency": "B",
        "C) Switch to faster storage or increase num_workers to saturate the pipeline": "C",
        "D) Reduce model size to lower compute time per batch": "D",
    }
    act1_reflection = mo.ui.radio(
        options=_choices,
        label="What is the correct fix for 23% GPU utilization caused by data loading?",
    )
    act1_reflection
    return (act1_reflection,)
@app.cell(hide_code=True)
def _(mo, act1_reflection):
    # Feedback for the Act I reflection question (C is correct).
    mo.stop(
        act1_reflection.value is None,
        mo.callout(mo.md("Select your answer to continue."), kind="warn"),
    )
    # Dispatch table: answer letter -> (callout kind, feedback text).
    _feedback = {
        "C": ("success",
            "**Correct.** Faster storage (NVMe: 3.5 GB/s vs HDD: 0.15 GB/s) eliminates "
            "I/O wait directly. More DataLoader workers parallelize reads across multiple "
            "CPU cores, increasing effective bandwidth. Both approaches attack the same "
            "root cause: insufficient I/O throughput relative to GPU compute speed. "
            "Adding GPUs (A) worsens the I/O competition. Larger batches (B) reduce "
            "the *number* of I/O operations but each operation loads more data, so total "
            "I/O time increases proportionally. Smaller models (D) reduce compute time, "
            "which actually *lowers* utilization further by making the GPU finish even faster."),
        "A": ("warn",
            "**Incorrect.** Adding GPUs distributes the compute load — but the storage "
            "bottleneck is shared. Each new GPU would compete for the same HDD bandwidth "
            "(0.15 GB/s), reducing the effective bandwidth per GPU. You would spend more "
            "money and achieve lower per-GPU utilization. The correct fix is C: faster "
            "storage or more DataLoader workers."),
        "B": ("warn",
            "**Partially helpful, but not the root fix.** Larger batches reduce the "
            "*number* of I/O calls per epoch, but each call loads 8× more data (512 vs 64 "
            "images). Total I/O bytes per epoch is unchanged, so the Feeding Tax "
            "remains proportionally similar. The root cause — storage bandwidth below "
            "what the GPU needs — requires faster storage (C). Larger batches also "
            "affect gradient statistics and may require learning rate adjustments."),
        "D": ("warn",
            "**Incorrect.** A smaller model finishes each batch computation even faster, "
            "making the GPU idle for even longer while waiting for I/O. This worsens "
            "the utilization metric. The problem is that storage is too slow, not that "
            "the model is too slow. The correct fix is C: faster storage or more workers."),
    }
    _kind, _text = _feedback[act1_reflection.value]
    mo.callout(mo.md(_text), kind=_kind)
    return
# ─── ACT I: MATHPEEK ─────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act I MathPeek: collapsible derivation of the GPU-utilization formula
    # and the Feeding Tax, with a numerical example matching the scenario.
    mo.accordion({
        "The governing equation — GPU Utilization and the Feeding Tax": mo.md("""
**GPU Utilization** (from @sec-data-engineering-feeding-problem):
$$\\eta_{GPU} = \\frac{t_{compute}}{t_{compute} + t_{IO\\ wait} + t_{preprocess}}$$
Where:
- **t_compute** — time the GPU is executing the forward/backward pass
- **t_IO wait** — time the GPU idles waiting for the next batch from storage
- **t_preprocess** — time for CPU-side decode and augmentation
**The Feeding Tax** is the complement: `Feeding Tax = (1 - η_GPU) × 100%`
When storage bandwidth (BW_storage) is less than the GPU's required bandwidth
(BW_required = batch_bytes × GPU_throughput), the pipeline stalls:
$$t_{IO\\ wait} = \\frac{batch\\ bytes}{BW_{storage}} - t_{compute}$$
$$\\text{(positive when storage is the bottleneck, zero when GPU is the bottleneck)}$$
**Numerical example (10 TB dataset, HDD, 4 workers):**
```
t_compute = (64 × 4.1 GFLOPs) / 312 TFLOPS ≈ 0.84 ms
t_IO = (64 × 600 KB) / 150 MB/s ≈ 2.56 ms
t_IO_wait = 2.56 - 0.84 = 1.72 ms (GPU is idle)
t_preprocess = 2.56 × 0.2 = 0.51 ms
η_GPU = 0.84 / (0.84 + 1.72 + 0.51) ≈ 27% ← near the 23% observation
```
**Pipeline overlap** occurs when t_IO < t_compute: prefetching can hide I/O
latency and η_GPU → 100%. NVMe at 3.5 GB/s achieves this for ResNet-50.
"""),
    })
    return
# ═════════════════════════════════════════════════════════════════════════════
# ACT II: THE DATA GRAVITY CALCULATION
# ═════════════════════════════════════════════════════════════════════════════
@app.cell(hide_code=True)
def _(mo, COLORS):
    # Act II banner.
    # NOTE(review): f-string body is plain text; presumably the original
    # carried HTML markup using the `_c` accent — confirm rendering.
    _c = COLORS["Cloud"]
    mo.Html(f"""
Act II · 20–25 minutes
The Data Gravity Calculation
You have fixed the training pipeline. Now a new problem: the team's 50 TB
training dataset lives in AWS us-east-1. The GPUs they need are available
in us-west-2. Should they move the data, or move the compute?
""")
    return
# ─── ACT II: STAKEHOLDER MESSAGE ─────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, COLORS):
    # Act II stakeholder message card.
    # NOTE(review): f-string body is plain text; presumably the original
    # carried HTML markup using the `_c` accent — confirm rendering.
    _c = COLORS["BlueLine"]
    mo.Html(f"""
Incoming Message · Cloud Infrastructure Lead
"We have 50 TB of training data in us-east-1 and a cluster of 8 A100s
available in us-west-2. We have a 100 Gbps inter-region link.
The training run takes about 6 hours. Should we transfer the data
to us-west-2 first, or spin up compute in us-east-1?"
""")
    return
# ─── ACT II: PREDICTION LOCK ─────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act II prediction prompt shown before the calculator unlocks.
    _prompt = """
#### Your Prediction
*Commit to an estimate before using the calculator.*
You have a 50 TB dataset in AWS us-east-1. Training GPUs are in us-west-2.
The inter-region link runs at 100 Gbps. How long does the data transfer take?
"""
    mo.md(_prompt)
    return
@app.cell(hide_code=True)
def _(mo):
    # Act II prediction lock: transfer-time estimates keyed by letter.
    _choices = {
        "A) About 1 minute — 100 Gbps is very fast": "A",
        "B) About 67 minutes — 100 Gbps = 12.5 GB/s, 50 TB ÷ 12.5 GB/s ≈ 67 min": "B",
        "C) About 11 hours — the practical throughput is much lower than the rated speed": "C",
        "D) About 4.6 days — transfer overhead and routing make 100 Gbps unusable": "D",
    }
    act2_prediction = mo.ui.radio(
        options=_choices,
        label="Transfer time for 50 TB over a 100 Gbps inter-region link:",
    )
    act2_prediction
    return (act2_prediction,)
@app.cell(hide_code=True)
def _(mo, act2_prediction):
    # Gate: halt this cell's output until an estimate is selected.
    mo.stop(
        act2_prediction.value is None,
        mo.callout(mo.md("Select your transfer time estimate to unlock the data gravity calculator."), kind="warn"),
    )
    # value is a single letter; [:2] is a harmless defensive slice.
    _locked = (
        f"**Prediction locked: {act2_prediction.value[:2]}** "
        "Now use the calculator to determine whether your estimate was correct — "
        "and more importantly, whether transferring is the right decision at all."
    )
    mo.callout(mo.md(_locked), kind="info")
    return
# ─── ACT II: INSTRUMENTS ─────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Introduce the Act II data gravity calculator instruments.
    _intro = """
#### The Data Gravity Calculator
Adjust the dataset size and network parameters to see when data transfer
exceeds your training time budget — and when it becomes cheaper to move
the compute instead of the data.
"""
    mo.md(_intro)
    return
@app.cell(hide_code=True)
def _(mo):
    # Act II instrument panel: log-scale dataset size, network tier, budget.
    _network_options = {
        "1 Gbps Ethernet (0.125 GB/s)": "1g",
        "10 Gbps Ethernet (1.25 GB/s)": "10g",
        "100 Gbps Ethernet (12.5 GB/s)": "100g",
    }
    # Slider is log10 of GB; default 4 → 10^4 GB = 10 TB.
    # (log10 of the scenario's 50 TB = log10(51200) ≈ 4.7.)
    act2_dataset_gb = mo.ui.slider(
        start=1,
        stop=6,
        value=4,
        step=1,
        label="Dataset size (log10 GB) — 10¹ = 10 GB, 10⁶ = 1 PB",
        show_value=True,
    )
    act2_network_bw = mo.ui.dropdown(
        options=_network_options,
        value="100 Gbps Ethernet (12.5 GB/s)",
        label="Network bandwidth",
    )
    act2_training_hours = mo.ui.slider(
        start=1,
        stop=72,
        value=6,
        step=1,
        label="Training time budget (hours)",
        show_value=True,
    )
    mo.hstack(
        [act2_dataset_gb, act2_network_bw, act2_training_hours],
        gap=2,
        justify="start",
    )
    return (act2_dataset_gb, act2_network_bw, act2_training_hours)
@app.cell(hide_code=True)
def _(mo, act2_dataset_gb, act2_network_bw, act2_training_hours, context_toggle, go, apply_plotly_theme, COLORS, math):
    # Act II physics engine: transfer time vs. training budget, egress cost
    # vs. compute-in-place cost, bar chart, metric cards, formula walkthrough.
    # NOTE(review): the `math` parameter is not used in this cell — confirm
    # it can be dropped from the signature.
    # ── Physics engine ────────────────────────────────────────────────────────
    # Source: DataGravity class in data_engineering.qmd
    # T_transfer = D_vol / BW (from @sec-data-engineering-data-gravity-adcb)
    # AWS egress: $0.08/GB (DataGravity.egress_cost_per_gb_str)
    _dataset_gb_val = 10 ** act2_dataset_gb.value  # slider is log10(GB)
    # Network bandwidth (GB/s) by tier
    _net_bw_map = {
        "1g": 0.125,   # 1 Gbps = 0.125 GB/s
        "10g": 1.25,   # 10 Gbps = 1.25 GB/s
        "100g": 12.5,  # 100 Gbps = 12.5 GB/s
    }
    _net_bw_gbs = _net_bw_map[act2_network_bw.value]
    # Context-dependent cost: cloud has egress, edge has none
    _ctx = context_toggle.value
    if _ctx == "cloud":
        _egress_cost_per_gb = 0.08  # AWS egress, DataGravity.egress_cost_per_gb_str
    else:
        _egress_cost_per_gb = 0.00  # local LAN, no egress fee
    # Transfer calculations
    _transfer_seconds = _dataset_gb_val / _net_bw_gbs
    _transfer_hours = _transfer_seconds / 3600
    _transfer_cost = _dataset_gb_val * _egress_cost_per_gb
    # Training budget
    _training_hours = act2_training_hours.value
    # Failure state: transfer > training budget
    _transfer_exceeds_training = _transfer_hours > _training_hours
    # Decision metric: compare transfer cost to compute-in-place cost
    # Approximate: spinning up equivalent compute in source region
    # A100 spot price ~$2.50/GPU-hr, 8 GPUs
    _compute_spot_cost_per_hour = 20.0  # $20/hr for 8× A100 spot
    _compute_cost_to_stay = _compute_spot_cost_per_hour * _training_hours
    # NOTE(review): _total_transfer_cost is assigned but never used below.
    _total_transfer_cost = _transfer_cost  # (ignoring compute differential for clarity)
    # Format dataset size for display
    if _dataset_gb_val >= 1e6:
        _ds_label = f"{_dataset_gb_val/1e6:.1f} PB"
    elif _dataset_gb_val >= 1e3:
        _ds_label = f"{_dataset_gb_val/1e3:.1f} TB"
    else:
        _ds_label = f"{_dataset_gb_val:.0f} GB"
    # Format transfer time for display
    if _transfer_hours >= 24:
        _time_label = f"{_transfer_hours/24:.1f} days"
    elif _transfer_hours >= 1:
        _time_label = f"{_transfer_hours:.1f} hours"
    else:
        _time_label = f"{_transfer_hours * 60:.0f} minutes"
    # ── Bar chart: transfer time vs training budget ───────────────────────────
    # Transfer bar turns red when it exceeds the budget (failure state).
    _transfer_color = COLORS["RedLine"] if _transfer_exceeds_training else COLORS["GreenLine"]
    _train_color = COLORS["BlueLine"]
    _fig2 = go.Figure()
    _fig2.add_trace(go.Bar(
        name="Data Transfer",
        x=["Time Comparison (hours)"],
        y=[_transfer_hours],
        marker_color=_transfer_color,
        text=[f"{_transfer_hours:.1f}h"],
        textposition="outside",
        width=0.3,
    ))
    _fig2.add_trace(go.Bar(
        name="Training Budget",
        x=["Time Comparison (hours)"],
        y=[_training_hours],
        marker_color=_train_color,
        text=[f"{_training_hours}h budget"],
        textposition="outside",
        width=0.3,
    ))
    _fig2.update_layout(
        barmode="group",
        height=280,
        yaxis_title="Hours",
        showlegend=True,
        margin=dict(l=40, r=20, t=40, b=40),
        # NOTE(review): .value is the short code ("1g"/"10g"/"100g"), so the
        # [:6] slice is a no-op — presumably the label was intended. Confirm.
        title=dict(text=f"Transfer vs Training: {_ds_label} over {act2_network_bw.value[:6]}", font=dict(size=13)),
    )
    apply_plotly_theme(_fig2)
    # ── Metric cards ──────────────────────────────────────────────────────────
    # NOTE(review): _cost_color is unused in the visible code and the card body
    # is plain text; presumably HTML markup was stripped — confirm rendering.
    _cost_color = COLORS["RedLine"] if _transfer_cost > _compute_cost_to_stay else COLORS["GreenLine"]
    _cards2_html = f"""
Transfer Time
{_time_label}
{_ds_label} at {_net_bw_gbs} GB/s
Egress Cost
${_transfer_cost:,.0f}
@ ${_egress_cost_per_gb:.2f}/GB
Compute-in-Place
${_compute_cost_to_stay:,.0f}
8× A100 spot × {_training_hours}h
"""
    # ── Physics formula display ───────────────────────────────────────────────
    _formula2_md = f"""
**The physics (from @sec-data-engineering-data-gravity-adcb):**
```
T_transfer = D_vol / BW
= {_dataset_gb_val:,.0f} GB / {_net_bw_gbs} GB/s
= {_transfer_seconds:,.0f} s
= {_time_label}
Egress cost = D_vol × $0.08/GB
= {_dataset_gb_val:,.0f} × $0.08
= ${_transfer_cost:,.0f}
```
"""
    mo.vstack([
        mo.Html(_cards2_html),
        mo.as_html(_fig2),
        mo.md(_formula2_md),
    ])
    # NOTE(review): marimo treats underscore-prefixed names as cell-local, so
    # returning them for use in other cells may not work as intended — confirm.
    return (
        _transfer_exceeds_training,
        _transfer_hours,
        _training_hours,
        _transfer_cost,
        _compute_cost_to_stay,
        _time_label,
        _ds_label,
        _transfer_color,
    )
# ─── ACT II: FAILURE STATE ───────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, _transfer_exceeds_training, _transfer_hours, _training_hours, _time_label, _ds_label):
    # Act II failure-state callout: danger when transfer exceeds the budget,
    # warn when it consumes more than half of it, success otherwise.
    # NOTE(review): underscore-prefixed parameters are cell-local in marimo;
    # confirm these values actually flow in from the calculator cell.
    if _transfer_exceeds_training:
        mo.callout(mo.md(
            f"**Data transfer ({_time_label}) exceeds training budget ({_training_hours} hours). "
            f"Moving {_ds_label} over the network costs more time than the training run itself.**\n\n"
            "This is the **data gravity trap**: at this scale, compute must move to the data. "
            "Options: (1) spin up training in the same region as the data, "
            "(2) use a Data Lakehouse — run the training job directly on the storage node, "
            "or (3) upgrade to a faster network link. "
            "Pull the dataset size or training budget slider to find the breakeven point."
        ), kind="danger")
    else:
        # Near-threshold warning at >50% of the budget.
        _ratio = _transfer_hours / _training_hours
        if _ratio > 0.5:
            mo.callout(mo.md(
                f"**Transfer feasible but costly: {_time_label} is {_ratio*100:.0f}% of your "
                f"{_training_hours}-hour training budget.**\n\n"
                "Data is approaching the gravity threshold. A small increase in dataset size "
                "or reduction in training time will trigger the trap. Consider co-locating "
                "compute with data as a proactive architectural decision."
            ), kind="warn")
        else:
            mo.callout(mo.md(
                f"**Transfer is viable: {_time_label} is well within the {_training_hours}-hour budget.**\n\n"
                "At this scale, data transfer is not the bottleneck. Data gravity has not yet "
                "trapped this workload. Increase the dataset size to find where the physics "
                "forces the architectural switch from 'move data' to 'move compute.'"
            ), kind="success")
    return
# ─── ACT II: PREDICTION REVEAL ───────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act2_prediction):
    # Act II prediction reveal: compare the locked estimate to the physics.
    # FIX: before an estimate is locked, act2_prediction.value is None and the
    # dict lookup below raised KeyError — the mo.stop in the lock cell only
    # halts that cell, not this one. Guard explicitly.
    mo.stop(act2_prediction.value is None)
    # 50 TB at 100 Gbps: 50,000 GB / 12.5 GB/s = 4,000 s = 66.7 min ≈ 67 min
    _actual_minutes = 67
    _actual_cost = 50_000 * 0.08  # 50 TB × $0.08/GB = $4,000 (quoted in the text below)
    # Each option's implied transfer-time estimate, in minutes.
    _predicted = {
        "A": 1,
        "B": 67,
        "C": 660,
        "D": 6624,
    }[act2_prediction.value]
    _ratio = _actual_minutes / _predicted if _predicted > 0 else float("inf")
    _is_correct = act2_prediction.value == "B"
    if _is_correct:
        mo.callout(mo.md(
            f"**Correct. You predicted ~{_predicted} minutes. The actual transfer time is ~{_actual_minutes} minutes.**\n\n"
            f"50 TB ÷ 12.5 GB/s = 4,000 s = **{_actual_minutes} minutes**. "
            f"Plus the egress cost: 50,000 GB × $0.08 = **$4,000**. "
            "The transfer is feasible for a 6-hour training run — but $4,000 in egress "
            "may exceed the cost of spinning up equivalent compute in us-east-1."
        ), kind="success")
    elif act2_prediction.value == "A":
        mo.callout(mo.md(
            f"**You were off by {_ratio:.0f}×. You predicted ~{_predicted} minute. "
            f"The actual transfer time is ~{_actual_minutes} minutes.**\n\n"
            "100 Gbps sounds fast, but it equals only 12.5 GB/s. Dividing 50 TB "
            "(= 50,000 GB) by 12.5 GB/s gives 4,000 seconds = **67 minutes**. "
            "Sustained 100 Gbps is rare in practice; real transfers are slower."
        ), kind="warn")
    elif act2_prediction.value == "C":
        mo.callout(mo.md(
            f"**You were off by {1/_ratio:.1f}×. You predicted ~{_predicted} minutes. "
            f"The actual transfer time is ~{_actual_minutes} minutes.**\n\n"
            "The calculation: 50 TB ÷ 12.5 GB/s = 4,000 s = **67 minutes**. "
            "A sustained 100 Gbps connection is fast enough to transfer 50 TB in "
            "just over an hour. The bottleneck becomes cost ($4,000 egress), not time."
        ), kind="warn")
    else:
        mo.callout(mo.md(
            f"**You were off by {1/_ratio:.1f}×. You predicted ~{_predicted/60:.1f} hours. "
            f"The actual transfer time is ~{_actual_minutes} minutes.**\n\n"
            "At 100 Gbps = 12.5 GB/s: 50 TB ÷ 12.5 GB/s = 4,000 s = **67 minutes**. "
            "The 100 Gbps link is genuinely fast. Data gravity at this scale is "
            "primarily an *economic* problem (egress cost) rather than a time problem."
        ), kind="warn")
    return
# ─── ACT II: DECISION COMPARISON ─────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, _transfer_cost, _compute_cost_to_stay, _time_label, _ds_label):
    # Side-by-side architecture decision card: transfer-the-data vs
    # move-the-compute, decided purely on dollar cost.
    # NOTE(review): underscore-prefixed parameters are cell-local in marimo;
    # confirm these values actually flow in from the calculator cell. The
    # f-string body is plain text; presumably HTML markup (using
    # _decision_color) was stripped — confirm rendering.
    _decision = "Move compute to data" if _transfer_cost > _compute_cost_to_stay else "Move data to compute"
    _decision_color = "#CB202D" if _transfer_cost > _compute_cost_to_stay else "#008F45"  # red / green accent
    mo.Html(f"""
Architecture Decision
Transfer data ({_ds_label}) to compute
{_time_label} + ${_transfer_cost:,.0f} egress
vs
Spin up compute where data lives
$0 transfer + ${_compute_cost_to_stay:,.0f} compute
{_decision}
""")
    return
# ─── ACT II: REFLECTION ──────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act II reflection prompt.
    _prompt = """
#### Reflection
You have seen data gravity in action. Now identify the principle it demonstrates.
"""
    mo.md(_prompt)
    return
@app.cell(hide_code=True)
def _(mo):
    # Act II reflection question: what data gravity means (B is correct).
    _choices = {
        "A) Large datasets are heavy to store and require expensive hardware": "A",
        "B) Compute naturally migrates toward large datasets because transfer cost exceeds compute cost": "B",
        "C) Data should always be compressed before training to reduce transfer time": "C",
        "D) Cloud infrastructure is always faster than edge for data-intensive workloads": "D",
    }
    act2_reflection = mo.ui.radio(
        options=_choices,
        label="Data gravity means:",
    )
    act2_reflection
    return (act2_reflection,)
@app.cell(hide_code=True)
def _(mo, act2_reflection):
    """Grade the Act II reflection answer and render targeted feedback.

    Halts (showing a prompt) until an option is selected, then shows a
    success callout for the correct answer (B) or a corrective callout
    explaining why A, C, or D is wrong.
    """
    mo.stop(
        act2_reflection.value is None,
        mo.callout(mo.md("Select your answer to continue."), kind="warn"),
    )
    # FIX: bind each branch's callout to _feedback and make it the cell's
    # final expression. marimo renders only a cell's last expression; a bare
    # expression statement inside an if/elif branch is discarded, so the
    # original callouts were never displayed.
    if act2_reflection.value == "B":
        _feedback = mo.callout(mo.md(
            "**Correct.** Data gravity (from @sec-data-engineering-data-gravity-adcb) is "
            "the economic and physical pressure that pushes compute toward large datasets. "
            "When T_transfer = D_vol / BW exceeds the training time budget, or when egress "
            "cost exceeds the cost of running compute in the data's region, it becomes "
            "cheaper to bring the compute to the data. This explains the architecture of "
            "Data Lakehouses — processing engines (Spark, Presto, training jobs) run "
            "directly on the storage nodes where the data already resides."
        ), kind="success")
    elif act2_reflection.value == "A":
        _feedback = mo.callout(mo.md(
            "**Incorrect.** Data gravity is not about storage weight or hardware cost. "
            "It is about *movement cost*: the time and money required to transfer data "
            "across a network. A 1 PB dataset sitting in one region is not a gravity "
            "problem — the gravity problem begins when you try to move it somewhere else. "
            "The correct answer is B: compute migrates toward data when transfer cost "
            "exceeds the cost of co-locating compute."
        ), kind="warn")
    elif act2_reflection.value == "C":
        _feedback = mo.callout(mo.md(
            "**Incorrect.** Compression reduces the bytes to transfer, which can reduce "
            "transfer time (T = D_vol / BW — smaller D_vol, shorter T). But data gravity "
            "is not the observation that compression helps. It is the observation that "
            "beyond a certain scale, no amount of compression makes transfer viable — "
            "compute must move to the data instead. The correct answer is B."
        ), kind="warn")
    else:
        _feedback = mo.callout(mo.md(
            "**Incorrect.** Cloud infrastructure is not universally faster for data workloads. "
            "The key insight of data gravity is that *location relative to the data* determines "
            "which infrastructure is faster. An edge device processing data locally avoids all "
            "egress costs and network latency entirely. Cloud is faster only when the compute "
            "is co-located with the data (same region). The correct answer is B."
        ), kind="warn")
    _feedback
    return
# ─── ACT II: MATHPEEK ────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Collapsible "MathPeek" panel: the governing transfer-time equation,
    # the architectural/economic decision rules, and a worked 50 TB example.
    _panel = mo.md("""
**Data Gravity** (from @sec-data-engineering-data-gravity-adcb):
$$T_{transfer} = \\frac{D_{vol}}{BW}$$
Where:
- **D_vol** — dataset volume in GB
- **BW** — network bandwidth in GB/s
- **T_transfer** — total transfer time in seconds
**The architectural decision rule** (from the DataGravity notebook):
> *If T_transfer > T_training → move compute to data*
> *If T_transfer < T_training → move data to compute*
**The economic rule** includes egress cost:
$$Cost_{transfer} = D_{vol} \\times \\$0.08/\\text{GB}$$
At petabyte scale: $10^6 \\text{ GB} \\times \\$0.08 = \\$80{,}000$ egress alone.
**Numerical example (50 TB at 100 Gbps):**
```
T_transfer = 50,000 GB / 12.5 GB/s = 4,000 s ≈ 67 minutes
Cost = 50,000 GB × $0.08 = $4,000
Rule: If training takes < 67 min → the transfer takes longer than the job.
Move compute to data.
```
**The rule of thumb** (from the DataGravity notebook in data_engineering.qmd):
- *Petabyte scale:* Code moves to Data (Data Lakehouse, in-place compute)
- *Gigabyte scale:* Data moves to Code (standard transfer is viable)
""")
    mo.accordion({"The governing equation — Data Gravity": _panel})
    return
# ─── DESIGN LEDGER SAVE + HUD ─────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, ledger, context_toggle, act1_prediction, act1_reflection,
      act2_prediction, act2_reflection, _gpu_util, _transfer_exceeds_training,
      act1_storage_type, COLORS):
    # Persist this chapter's outcomes to the Design Ledger, then render the
    # HUD footer summarizing context, prediction results, gravity status,
    # and initial GPU utilization.
    #
    # NOTE(review): marimo treats underscore-prefixed names as cell-private;
    # receiving _gpu_util / _transfer_exceeds_training from other cells may
    # not resolve — confirm in the running notebook. COLORS is accepted as a
    # dependency but not referenced in the visible body — confirm whether it
    # is needed.
    #
    # Save chapter results to Design Ledger
    # A missing (None) answer counts as incorrect; correct letters are
    # B / C / B / B per the corresponding grading cells.
    _act1_correct = act1_prediction.value == "B" if act1_prediction.value else False
    _act1_refl_correct = act1_reflection.value == "C" if act1_reflection.value else False
    _act2_correct = act2_prediction.value == "B" if act2_prediction.value else False
    _act2_refl_correct = act2_reflection.value == "B" if act2_reflection.value else False
    ledger.save(
        chapter=4,
        design={
            "context": context_toggle.value,
            # Default to "hdd" when no storage tier has been chosen yet.
            "storage_type_chosen": act1_storage_type.value if act1_storage_type.value else "hdd",
            "gpu_util_at_start": round(_gpu_util, 1),
            "data_gravity_triggered": bool(_transfer_exceeds_training),
            "act1_correct": _act1_correct,
            "act1_reflection_correct": _act1_refl_correct,
            "act2_correct": _act2_correct,
            "act2_reflection_correct": _act2_refl_correct,
        },
    )
    # HUD footer
    _ctx_label = "Cloud — Multi-Region" if context_toggle.value == "cloud" else "Edge — Local"
    # Three-way status: correct / pending (unanswered) / incorrect.
    _act1_status = "correct" if _act1_correct else ("pending" if act1_prediction.value is None else "incorrect")
    _act2_status = "correct" if _act2_correct else ("pending" if act2_prediction.value is None else "incorrect")
    _gravity_status = "triggered" if _transfer_exceeds_training else "not triggered"
    def _status_color(s):
        # Map a status string to its HUD color (green/grey/red).
        # NOTE(review): not referenced in the visible f-string below — the
        # HTML appears tag-stripped; presumably this colored stripped style
        # attributes. TODO confirm against the original notebook.
        return {"correct": "#4ade80", "pending": "#94a3b8", "incorrect": "#f87171", "triggered": "#f87171", "not triggered": "#4ade80"}.get(s, "#94a3b8")
    mo.Html(f"""
DESIGN LEDGER · CH04
|
Context:
{_ctx_label}
Act I prediction:
{_act1_status}
Act II prediction:
{_act2_status}
Gravity trap:
{_gravity_status}
GPU util (initial):
{_gpu_util:.0f}%
""")
    return
# Entry point: run the marimo app when executed as a script
# (`python <file>.py`); `marimo edit`/`marimo run` import it instead.
if __name__ == "__main__":
    app.run()