import marimo __generated_with = "0.19.6" app = marimo.App(width="full") # ───────────────────────────────────────────────────────────────────────────── # LAB 04: THE DATA GRAVITY TRAP # # Chapter: data_engineering.qmd (@sec-data-engineering) # Core Invariant: Data gravity — large datasets attract compute to their # location. Moving data costs more than moving compute when data volume # exceeds network bandwidth × time budget. # # 2-Act structure (35-40 min total): # Act I: The Pipeline Bottleneck (12-15 min) # The GPU is idle 77% of the time — but the team wants more GPUs. # Prediction lock → timeline instrument → reveal → reflection → MathPeek # Act II: The Data Gravity Calculation (20-25 min) # 50 TB in us-east-1, training GPUs in us-west-2. Transfer or co-locate? # Prediction lock → gravity instruments → failure state → reflection → MathPeek # # Deployment contexts: # Cloud: Multi-region (100 Gbps inter-DC link, AWS egress $0.08/GB) # Edge: Local processing (1 Gbps LAN, zero egress cost) # # Traceability: # GPU utilization formula — @sec-data-engineering-feeding-problem # Feeding tax — FeedingProblem class in data_engineering.qmd # Data gravity T = D/BW — DataGravity class in data_engineering.qmd # AWS egress $0.08/GB — DataGravity.egress_cost_per_gb_str # 100 Gbps = 12.5 GB/s — DataGravity.network_gbs_str # HDD 0.15 GB/s — Storage tier physics, @sec-data-engineering # SSD 0.55 GB/s — Storage tier physics, @sec-data-engineering # NVMe 3.5 GB/s — Storage tier physics, @sec-data-engineering # RAM 50 GB/s — DRAM bandwidth, @sec-data-engineering # # Design Ledger save: # chapter=4, context, storage_type_chosen, gpu_util_at_start, # data_gravity_triggered, act1_correct, act2_correct # ───────────────────────────────────────────────────────────────────────────── # ─── CELL 0: SETUP (hide_code=False — leave visible) ───────────────────────── @app.cell def _(): import marimo as mo import sys import math from pathlib import Path import plotly.graph_objects as go import 
numpy as np _root = Path(__file__).resolve().parents[2] if str(_root) not in sys.path: sys.path.insert(0, str(_root)) from labs.core.state import DesignLedger from labs.core.style import COLORS, LAB_CSS, apply_plotly_theme ledger = DesignLedger() return mo, ledger, COLORS, LAB_CSS, apply_plotly_theme, go, np, math # ─── CELL 1: HEADER ────────────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, LAB_CSS, COLORS): _c = COLORS["BlueLine"] mo.vstack([ LAB_CSS, mo.Html(f"""

Machine Learning Systems · Volume I · Lab 04

The Data Gravity Trap

Your GPU is idle 77% of the time. The team wants to buy more hardware. Before spending another dollar, you need to diagnose whether the bottleneck is compute — or data movement.

Act I: Pipeline Bottleneck · 12–15 min Act II: Data Gravity · 20–25 min 35–40 min total Chapter 4: Data Engineering

"""), ]) return # ─── CELL 2: RECOMMENDED READING ───────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): mo.callout(mo.md(""" **Recommended Reading** — Complete the following before this lab: - **@sec-data-engineering-physics-data-cdcb** (Physics of Data) — Data gravity and the energy-movement invariant: why moving a bit costs 100–1,000x more than computing on it. - **@sec-data-engineering-feeding-problem** (The Feeding Problem) — The Feeding Tax, GPU utilization formula, and why storage bandwidth determines training throughput. - **@sec-data-engineering-data-gravity-adcb** (Data Gravity) — T = D/BW, the rule of thumb for when to move compute vs. data, and the lakehouse architectural response. """), kind="info") return # ─── CELL 3: CONTEXT TOGGLE ────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): context_toggle = mo.ui.radio( options={ "Cloud (multi-region, 100 Gbps inter-DC)": "cloud", "Edge (local processing, 1 Gbps LAN)": "edge", }, value="Cloud (multi-region, 100 Gbps inter-DC)", label="Deployment context:", inline=True, ) mo.vstack([ mo.md("---"), mo.md("### Select Your Deployment Context"), mo.md( "This choice persists across both acts. It changes the network bandwidth " "and egress cost assumptions in the data gravity calculations." ), context_toggle, ]) return (context_toggle,) @app.cell(hide_code=True) def _(mo, context_toggle, COLORS): _ctx = context_toggle.value if _ctx == "cloud": _bw_desc = "100 Gbps inter-region link = 12.5 GB/s" _cost_desc = "AWS egress: $0.08/GB" _color = COLORS["Cloud"] _label = "Cloud — Multi-Region" else: _bw_desc = "1 Gbps LAN = 0.125 GB/s" _cost_desc = "Local network: $0.00/GB" _color = COLORS["Edge"] _label = "Edge — Local Processing" mo.callout(mo.md( f"**Context: {_label}** — Network bandwidth: {_bw_desc}. Transfer cost: {_cost_desc}. " f"The data gravity calculation in Act II will use these parameters." 
), kind="info") return # ═════════════════════════════════════════════════════════════════════════════ # ACT I: THE PIPELINE BOTTLENECK # ═════════════════════════════════════════════════════════════════════════════ @app.cell(hide_code=True) def _(mo, COLORS): _c = COLORS["BlueLine"] mo.Html(f"""

Act I · 12–15 minutes

The Pipeline Bottleneck

A team of ML engineers is training a computer vision model. Their GPU utilization has been sitting at 23% for three days. Before you touch a single slider, you need to commit to a diagnosis.

""") return # ─── ACT I: STAKEHOLDER MESSAGE ────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, COLORS): _c = COLORS["OrangeLine"] mo.Html(f"""

Incoming Message · ML Engineering Lead

"We've been training ResNet-50 on a 10 TB image dataset for 3 days. GPU utilization is stuck at 23%. The model is clearly too complex for our hardware — we need to request 4× more GPUs before Friday's deadline. Can you sign off on the procurement request?"

""") return # ─── ACT I: PREDICTION LOCK ────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): mo.md(""" #### Your Prediction *Before interacting with the simulator, commit to your diagnosis.* GPU utilization is 23% during training. The team believes this means the GPU cannot keep up with the model's compute demands. """) return @app.cell(hide_code=True) def _(mo): act1_prediction = mo.ui.radio( options={ "A) Insufficient GPU FLOPS — the model requires more compute than the GPU provides": "A", "B) Data loading is slower than GPU computation — the GPU is starving for input": "B", "C) The model is too small — it does not utilize the GPU's parallel units": "C", "D) The learning rate is too high — training is unstable and wasting cycles": "D", }, label="GPU utilization is 23% during training. The most likely bottleneck is:", ) act1_prediction return (act1_prediction,) @app.cell(hide_code=True) def _(mo, act1_prediction): mo.stop( act1_prediction.value is None, mo.callout( mo.md("Select your prediction to unlock the pipeline simulator."), kind="warn", ), ) mo.callout( mo.md( f"**Prediction locked: {act1_prediction.value[:2]}** " "Now explore the simulator to test your hypothesis." ), kind="info", ) return # ─── ACT I: INSTRUMENTS ────────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): mo.md(""" #### The Pipeline Simulator Adjust the parameters below to see how storage type and data loading configuration affect GPU utilization. The timeline shows where time goes within a single training batch. 
""")
    return


@app.cell(hide_code=True)
def _(mo):
    # Act I simulator inputs. The storage dropdown maps a human-readable label
    # to the short key used by the bandwidth table in the physics cell.
    act1_dataset_size_tb = mo.ui.slider(
        start=1,
        stop=100,
        value=10,
        step=1,
        label="Dataset size (TB)",
        show_value=True,
    )
    act1_storage_type = mo.ui.dropdown(
        options={
            "HDD (0.15 GB/s)": "hdd",
            "SSD (0.55 GB/s)": "ssd",
            "NVMe (3.5 GB/s)": "nvme",
            "RAM disk (50 GB/s)": "ram",
        },
        value="HDD (0.15 GB/s)",
        label="Storage type",
    )
    act1_num_workers = mo.ui.slider(
        start=1,
        stop=32,
        value=4,
        step=1,
        label="DataLoader workers",
        show_value=True,
    )
    mo.hstack(
        [act1_dataset_size_tb, act1_storage_type, act1_num_workers],
        gap=2,
        justify="start",
    )
    return (act1_dataset_size_tb, act1_storage_type, act1_num_workers)


@app.cell(hide_code=True)
def _(mo, act1_storage_type, act1_num_workers, act1_dataset_size_tb, go, apply_plotly_theme, COLORS):
    # ── Physics engine ────────────────────────────────────────────────────────
    # Source: @sec-data-engineering-feeding-problem
    #
    # NOTE: act1_dataset_size_tb is accepted so the cell re-runs with the
    # slider, but the per-batch model below does not reference it.
    #
    # GPU compute time per batch (ResNet-50, batch=64, A100):
    #   t_compute = (64 * 4.1e9) / (312e12) ≈ 0.00084 s ≈ 0.84 ms
    # 312 TFLOPS is the A100's dense FP16/BF16 Tensor Core peak; the plain
    # FP32 peak is 19.5 TFLOPS, so the constant is named for what it is.
    _BATCH_SIZE = 64
    _RESNET50_GFLOPS_PER_IMG = 4.1      # GFLOPs per image
    _A100_TENSOR_PEAK_TFLOPS = 312.0    # TFLOPS, FP16/BF16 Tensor Core (NVIDIA spec)
    _IMG_SIZE_MB = (224 * 224 * 3 * 4) / (1024 * 1024)  # 224x224 RGB, 4 bytes/channel

    _t_compute_s = (_BATCH_SIZE * _RESNET50_GFLOPS_PER_IMG * 1e9) / (
        _A100_TENSOR_PEAK_TFLOPS * 1e12
    )

    # Storage bandwidth (GB/s) — source: @sec-data-engineering storage tier data
    _storage_bw = {
        "hdd": 0.15,   # HDD sequential read
        "ssd": 0.55,   # SATA SSD
        "nvme": 3.5,   # NVMe PCIe 4.0
        "ram": 50.0,   # DRAM
    }
    _bw = _storage_bw[act1_storage_type.value]

    # Effective bandwidth scales linearly with workers, normalised so that
    # 4 workers = 1.0x, and saturates at 8 workers (2.0x). Capping the worker
    # count already bounds the factor, so no second clamp is needed.
    _worker_factor = min(act1_num_workers.value, 8) / 4.0
    _effective_bw = _bw * _worker_factor

    # I/O time per batch: bytes to load / effective bandwidth
    _batch_bytes_gb = _BATCH_SIZE * _IMG_SIZE_MB / 1024
    _t_io_s = _batch_bytes_gb / _effective_bw

    # Preprocessing: fixed 0.2× of IO time (decode, augment)
    _t_preprocess_s = _t_io_s * 0.2

    # Total batch time and GPU utilization.
    #   GPU util = t_compute / (t_compute + max(t_io - t_compute, 0) + t_preprocess)
    # When IO > compute the GPU waits for the difference; when compute >= IO
    # the load is assumed to overlap fully with compute.
    _t_wait = max(_t_io_s - _t_compute_s, 0.0)
    _t_total = _t_compute_s + _t_wait + _t_preprocess_s
    _gpu_util = min(_t_compute_s / _t_total, 1.0) * 100.0

    # Color coding for utilization
    if _gpu_util >= 80:
        _util_color, _util_label = COLORS["GreenLine"], "Healthy"
    elif _gpu_util >= 50:
        _util_color, _util_label = COLORS["OrangeLine"], "Degraded"
    else:
        _util_color, _util_label = COLORS["RedLine"], "Starved"

    # Scale to ms for display
    _t_compute_ms = _t_compute_s * 1000
    _t_io_ms = _t_io_s * 1000
    _t_wait_ms = _t_wait * 1000
    _t_preprocess_ms = _t_preprocess_s * 1000
    _t_total_ms = _t_total * 1000

    # ── Timeline bar chart ────────────────────────────────────────────────────
    # One stacked horizontal bar; each segment is (legend name, in-bar caption,
    # duration in ms, color). The wait segment reuses _t_wait so the chart and
    # the utilization formula can never disagree.
    _segments = (
        ("GPU Compute", "GPU Compute", _t_compute_ms, COLORS["GreenLine"]),
        ("Data Loading (I/O)", "I/O Wait", _t_wait_ms, COLORS["RedLine"]),
        ("Preprocessing", "Preprocess", _t_preprocess_ms, COLORS["OrangeLine"]),
    )
    _fig = go.Figure()
    for _seg_name, _seg_caption, _seg_ms, _seg_color in _segments:
        _fig.add_trace(go.Bar(
            name=_seg_name,
            x=[_seg_ms],
            y=["Batch Timeline"],
            orientation="h",
            marker_color=_seg_color,
            # Plotly renders <br> as a line break inside bar text.
            text=[f"{_seg_caption}<br>{_seg_ms:.2f} ms"],
            textposition="inside",
            insidetextanchor="middle",
        ))
    _fig.update_layout(
        barmode="stack",
        height=160,
        xaxis_title="Time per batch (ms)",
        showlegend=True,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0),
        margin=dict(l=10, r=10, t=40, b=40),
        title=dict(text=f"Batch Timeline — Total: {_t_total_ms:.2f} ms", font=dict(size=13)),
    )
    apply_plotly_theme(_fig)

    # ── Metric cards ──────────────────────────────────────────────────────────
    _cards_html = f"""

GPU Utilization

{_gpu_util:.0f}%

{_util_label}

GPU Compute

{_t_compute_ms:.2f} ms

per batch

I/O Wait

{_t_io_ms:.2f} ms

per batch

Storage BW

{_effective_bw:.2f} GB/s

{act1_num_workers.value} workers

""" # ── Physics formula display ─────────────────────────────────────────────── _formula_md = f""" **The physics (from @sec-data-engineering-feeding-problem):** ``` GPU Utilization = t_compute / (t_compute + t_io_wait + t_preprocess) = {_t_compute_ms:.2f} ms / ({_t_compute_ms:.2f} + {max(_t_io_ms - _t_compute_ms, 0):.2f} + {_t_preprocess_ms:.2f}) ms = {_gpu_util:.1f}% Effective BW = Storage_BW × worker_factor = {_bw:.2f} GB/s × {min(act1_num_workers.value, 8) / 4.0:.2f} = {_effective_bw:.2f} GB/s I/O Time = batch_bytes / effective_BW = {_batch_bytes_gb * 1024:.1f} MB / {_effective_bw * 1024:.0f} MB/s = {_t_io_ms:.2f} ms ``` """ mo.vstack([ mo.Html(_cards_html), mo.as_html(_fig), mo.md(_formula_md), ]) return ( _gpu_util, _t_compute_ms, _t_io_ms, _effective_bw, _bw, ) # ─── ACT I: REVEAL ─────────────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, act1_prediction, _gpu_util): # Prediction-vs-reality overlay _pred_text = { "A": "GPU FLOPS insufficient", "B": "Data loading bottleneck (I/O bound)", "C": "Model too small", "D": "Learning rate too high", }[act1_prediction.value] _is_correct = act1_prediction.value == "B" _actual_gpu_util = _gpu_util if _is_correct: mo.callout(mo.md( f"**Correct. You predicted: {_pred_text}.**\n\n" f"The GPU is sitting at **{_actual_gpu_util:.0f}% utilization** not because " "it lacks FLOPS, but because it finishes each batch computation in " "~0.84 ms while the HDD requires ~3.6 ms to load the next batch. " "The GPU is I/O-bound: it spends 77% of its wall-clock time waiting " "for the data pipeline to deliver the next 64 images. " "Adding more GPUs would make the problem worse — each additional GPU " "would compete for the same storage bandwidth." ), kind="success") elif act1_prediction.value == "A": mo.callout(mo.md( f"**Not quite. You predicted: {_pred_text}.**\n\n" f"The GPU is at **{_actual_gpu_util:.0f}% utilization** but this is not " "because it lacks FLOPS. 
An A100 can process a ResNet-50 batch in ~0.84 ms. " "The bottleneck is that an HDD delivers only 0.15 GB/s — loading the same " "batch takes ~3.6 ms. The GPU completes its work, then waits. " "The 'fix' of adding GPUs would only increase I/O contention. " "**Correct answer: B — the pipeline is I/O-bound.**" ), kind="warn") elif act1_prediction.value == "C": mo.callout(mo.md( f"**Not quite. You predicted: {_pred_text}.**\n\n" f"The GPU is at **{_actual_gpu_util:.0f}% utilization** because it is waiting " "for data, not because the model is too simple. ResNet-50 requires 4.1 GFLOPs " "per image — this is not a trivial model. Even a 50-layer ResNet finishes its " "batch in 0.84 ms on an A100, which is then idle for 2.76 ms waiting for the " "HDD. Model complexity is irrelevant when the bottleneck is I/O. " "**Correct answer: B — the pipeline is I/O-bound.**" ), kind="warn") else: mo.callout(mo.md( f"**Not quite. You predicted: {_pred_text}.**\n\n" f"Learning rate affects convergence quality, not hardware utilization. " f"The GPU is at **{_actual_gpu_util:.0f}% utilization** because it finishes " "computing in 0.84 ms and then idles for 2.76 ms waiting for the storage " "system to load the next batch. This is a physical bottleneck in the " "data pipeline — it has nothing to do with the optimization algorithm. " "**Correct answer: B — the pipeline is I/O-bound.**" ), kind="warn") return # ─── ACT I: REFLECTION ─────────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): mo.md(""" #### Reflection You have seen that HDD storage produces 23% GPU utilization while NVMe reaches 78%. Now commit to the correct engineering response. 
""") return @app.cell(hide_code=True) def _(mo): act1_reflection = mo.ui.radio( options={ "A) Add more GPUs to the training cluster": "A", "B) Increase batch size from 64 to 512 to reduce I/O frequency": "B", "C) Switch to faster storage or increase num_workers to saturate the pipeline": "C", "D) Reduce model size to lower compute time per batch": "D", }, label="What is the correct fix for 23% GPU utilization caused by data loading?", ) act1_reflection return (act1_reflection,) @app.cell(hide_code=True) def _(mo, act1_reflection): mo.stop( act1_reflection.value is None, mo.callout(mo.md("Select your answer to continue."), kind="warn"), ) if act1_reflection.value == "C": mo.callout(mo.md( "**Correct.** Faster storage (NVMe: 3.5 GB/s vs HDD: 0.15 GB/s) eliminates " "I/O wait directly. More DataLoader workers parallelize reads across multiple " "CPU cores, increasing effective bandwidth. Both approaches attack the same " "root cause: insufficient I/O throughput relative to GPU compute speed. " "Adding GPUs (A) worsens the I/O competition. Larger batches (B) reduce " "the *number* of I/O operations but each operation loads more data, so total " "I/O time increases proportionally. Smaller models (D) reduce compute time, " "which actually *lowers* utilization further by making the GPU finish even faster." ), kind="success") elif act1_reflection.value == "A": mo.callout(mo.md( "**Incorrect.** Adding GPUs distributes the compute load — but the storage " "bottleneck is shared. Each new GPU would compete for the same HDD bandwidth " "(0.15 GB/s), reducing the effective bandwidth per GPU. You would spend more " "money and achieve lower per-GPU utilization. The correct fix is C: faster " "storage or more DataLoader workers." ), kind="warn") elif act1_reflection.value == "B": mo.callout(mo.md( "**Partially helpful, but not the root fix.** Larger batches reduce the " "*number* of I/O calls per epoch, but each call loads 8× more data (512 vs 64 " "images). 
Total I/O bytes per epoch is unchanged, so the Feeding Tax " "remains proportionally similar. The root cause — storage bandwidth below " "what the GPU needs — requires faster storage (C). Larger batches also " "affect gradient statistics and may require learning rate adjustments." ), kind="warn") else: mo.callout(mo.md( "**Incorrect.** A smaller model finishes each batch computation even faster, " "making the GPU idle for even longer while waiting for I/O. This worsens " "the utilization metric. The problem is that storage is too slow, not that " "the model is too slow. The correct fix is C: faster storage or more workers." ), kind="warn") return # ─── ACT I: MATHPEEK ───────────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): mo.accordion({ "The governing equation — GPU Utilization and the Feeding Tax": mo.md(""" **GPU Utilization** (from @sec-data-engineering-feeding-problem): $$\\eta_{GPU} = \\frac{t_{compute}}{t_{compute} + t_{IO\\ wait} + t_{preprocess}}$$ Where: - **t_compute** — time the GPU is executing the forward/backward pass - **t_IO wait** — time the GPU idles waiting for the next batch from storage - **t_preprocess** — time for CPU-side decode and augmentation **The Feeding Tax** is the complement: `Feeding Tax = (1 - η_GPU) × 100%` When storage bandwidth (BW_storage) is less than the GPU's required bandwidth (BW_required = batch_bytes × GPU_throughput), the pipeline stalls: $$t_{IO\\ wait} = \\frac{batch\\ bytes}{BW_{storage}} - t_{compute}$$ $$\\text{(positive when storage is the bottleneck, zero when GPU is the bottleneck)}$$ **Numerical example (10 TB dataset, HDD, 4 workers):** ``` t_compute = (64 × 4.1 GFLOPs) / 312 TFLOPS ≈ 0.84 ms t_IO = (64 × 600 KB) / 150 MB/s ≈ 2.56 ms t_IO_wait = 2.56 - 0.84 = 1.72 ms (GPU is idle) t_preprocess = 2.56 × 0.2 = 0.51 ms η_GPU = 0.84 / (0.84 + 1.72 + 0.51) ≈ 27% ← near the 23% observation ``` **Pipeline overlap** occurs when t_IO < t_compute: prefetching can hide I/O latency 
and η_GPU → 100%. NVMe at 3.5 GB/s achieves this for ResNet-50. """), }) return # ═════════════════════════════════════════════════════════════════════════════ # ACT II: THE DATA GRAVITY CALCULATION # ═════════════════════════════════════════════════════════════════════════════ @app.cell(hide_code=True) def _(mo, COLORS): _c = COLORS["Cloud"] mo.Html(f"""

Act II · 20–25 minutes

The Data Gravity Calculation

You have fixed the training pipeline. Now a new problem: the team's 50 TB training dataset lives in AWS us-east-1. The GPUs they need are available in us-west-2. Should they move the data, or move the compute?

""") return # ─── ACT II: STAKEHOLDER MESSAGE ───────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, COLORS): _c = COLORS["BlueLine"] mo.Html(f"""

Incoming Message · Cloud Infrastructure Lead

"We have 50 TB of training data in us-east-1 and a cluster of 8 A100s available in us-west-2. We have a 100 Gbps inter-region link. The training run takes about 6 hours. Should we transfer the data to us-west-2 first, or spin up compute in us-east-1?"

""") return # ─── ACT II: PREDICTION LOCK ───────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): mo.md(""" #### Your Prediction *Commit to an estimate before using the calculator.* You have a 50 TB dataset in AWS us-east-1. Training GPUs are in us-west-2. The inter-region link runs at 100 Gbps. How long does the data transfer take? """) return @app.cell(hide_code=True) def _(mo): act2_prediction = mo.ui.radio( options={ "A) About 1 minute — 100 Gbps is very fast": "A", "B) About 67 minutes — 100 Gbps = 12.5 GB/s, 50 TB ÷ 12.5 GB/s ≈ 67 min": "B", "C) About 11 hours — the practical throughput is much lower than the rated speed": "C", "D) About 4.6 days — transfer overhead and routing make 100 Gbps unusable": "D", }, label="Transfer time for 50 TB over a 100 Gbps inter-region link:", ) act2_prediction return (act2_prediction,) @app.cell(hide_code=True) def _(mo, act2_prediction): mo.stop( act2_prediction.value is None, mo.callout(mo.md("Select your transfer time estimate to unlock the data gravity calculator."), kind="warn"), ) mo.callout( mo.md( f"**Prediction locked: {act2_prediction.value[:2]}** " "Now use the calculator to determine whether your estimate was correct — " "and more importantly, whether transferring is the right decision at all." ), kind="info", ) return # ─── ACT II: INSTRUMENTS ───────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): mo.md(""" #### The Data Gravity Calculator Adjust the dataset size and network parameters to see when data transfer exceeds your training time budget — and when it becomes cheaper to move the compute instead of the data. 
""") return @app.cell(hide_code=True) def _(mo): act2_dataset_gb = mo.ui.slider( start=1, stop=6, value=4, # log10(50 TB in GB) = log10(51200) ≈ 4.7, use 4 as default (10 TB) step=1, label="Dataset size (log10 GB) — 10¹ = 10 GB, 10⁶ = 1 PB", show_value=True, ) act2_network_bw = mo.ui.dropdown( options={ "1 Gbps Ethernet (0.125 GB/s)": "1g", "10 Gbps Ethernet (1.25 GB/s)": "10g", "100 Gbps Ethernet (12.5 GB/s)": "100g", }, value="100 Gbps Ethernet (12.5 GB/s)", label="Network bandwidth", ) act2_training_hours = mo.ui.slider( start=1, stop=72, value=6, step=1, label="Training time budget (hours)", show_value=True, ) mo.hstack([ act2_dataset_gb, act2_network_bw, act2_training_hours, ], gap=2, justify="start") return (act2_dataset_gb, act2_network_bw, act2_training_hours) @app.cell(hide_code=True) def _(mo, act2_dataset_gb, act2_network_bw, act2_training_hours, context_toggle, go, apply_plotly_theme, COLORS, math): # ── Physics engine ──────────────────────────────────────────────────────── # Source: DataGravity class in data_engineering.qmd # T_transfer = D_vol / BW (from @sec-data-engineering-data-gravity-adcb) # AWS egress: $0.08/GB (DataGravity.egress_cost_per_gb_str) _dataset_gb_val = 10 ** act2_dataset_gb.value # Network bandwidth (GB/s) by tier _net_bw_map = { "1g": 0.125, # 1 Gbps = 0.125 GB/s "10g": 1.25, # 10 Gbps = 1.25 GB/s "100g": 12.5, # 100 Gbps = 12.5 GB/s } _net_bw_gbs = _net_bw_map[act2_network_bw.value] # Context-dependent cost: cloud has egress, edge has none _ctx = context_toggle.value if _ctx == "cloud": _egress_cost_per_gb = 0.08 # AWS egress, DataGravity.egress_cost_per_gb_str else: _egress_cost_per_gb = 0.00 # local LAN, no egress fee # Transfer calculations _transfer_seconds = _dataset_gb_val / _net_bw_gbs _transfer_hours = _transfer_seconds / 3600 _transfer_cost = _dataset_gb_val * _egress_cost_per_gb # Training budget _training_hours = act2_training_hours.value # Failure state: transfer > training budget _transfer_exceeds_training = 
_transfer_hours > _training_hours # Decision metric: compare transfer cost to compute-in-place cost # Approximate: spinning up equivalent compute in source region # A100 spot price ~$2.50/GPU-hr, 8 GPUs _compute_spot_cost_per_hour = 20.0 # $20/hr for 8× A100 spot _compute_cost_to_stay = _compute_spot_cost_per_hour * _training_hours _total_transfer_cost = _transfer_cost # (ignoring compute differential for clarity) # Format dataset size for display if _dataset_gb_val >= 1e6: _ds_label = f"{_dataset_gb_val/1e6:.1f} PB" elif _dataset_gb_val >= 1e3: _ds_label = f"{_dataset_gb_val/1e3:.1f} TB" else: _ds_label = f"{_dataset_gb_val:.0f} GB" # Format transfer time for display if _transfer_hours >= 24: _time_label = f"{_transfer_hours/24:.1f} days" elif _transfer_hours >= 1: _time_label = f"{_transfer_hours:.1f} hours" else: _time_label = f"{_transfer_hours * 60:.0f} minutes" # ── Bar chart: transfer time vs training budget ─────────────────────────── _transfer_color = COLORS["RedLine"] if _transfer_exceeds_training else COLORS["GreenLine"] _train_color = COLORS["BlueLine"] _fig2 = go.Figure() _fig2.add_trace(go.Bar( name="Data Transfer", x=["Time Comparison (hours)"], y=[_transfer_hours], marker_color=_transfer_color, text=[f"{_transfer_hours:.1f}h"], textposition="outside", width=0.3, )) _fig2.add_trace(go.Bar( name="Training Budget", x=["Time Comparison (hours)"], y=[_training_hours], marker_color=_train_color, text=[f"{_training_hours}h budget"], textposition="outside", width=0.3, )) _fig2.update_layout( barmode="group", height=280, yaxis_title="Hours", showlegend=True, margin=dict(l=40, r=20, t=40, b=40), title=dict(text=f"Transfer vs Training: {_ds_label} over {act2_network_bw.value[:6]}", font=dict(size=13)), ) apply_plotly_theme(_fig2) # ── Metric cards ────────────────────────────────────────────────────────── _cost_color = COLORS["RedLine"] if _transfer_cost > _compute_cost_to_stay else COLORS["GreenLine"] _cards2_html = f"""

Transfer Time

{_time_label}

{_ds_label} at {_net_bw_gbs} GB/s

Egress Cost

${_transfer_cost:,.0f}

@ ${_egress_cost_per_gb:.2f}/GB

Compute-in-Place

${_compute_cost_to_stay:,.0f}

8× A100 spot × {_training_hours}h

""" # ── Physics formula display ─────────────────────────────────────────────── _formula2_md = f""" **The physics (from @sec-data-engineering-data-gravity-adcb):** ``` T_transfer = D_vol / BW = {_dataset_gb_val:,.0f} GB / {_net_bw_gbs} GB/s = {_transfer_seconds:,.0f} s = {_time_label} Egress cost = D_vol × $0.08/GB = {_dataset_gb_val:,.0f} × $0.08 = ${_transfer_cost:,.0f} ``` """ mo.vstack([ mo.Html(_cards2_html), mo.as_html(_fig2), mo.md(_formula2_md), ]) return ( _transfer_exceeds_training, _transfer_hours, _training_hours, _transfer_cost, _compute_cost_to_stay, _time_label, _ds_label, _transfer_color, ) # ─── ACT II: FAILURE STATE ─────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, _transfer_exceeds_training, _transfer_hours, _training_hours, _time_label, _ds_label): if _transfer_exceeds_training: mo.callout(mo.md( f"**Data transfer ({_time_label}) exceeds training budget ({_training_hours} hours). " f"Moving {_ds_label} over the network costs more time than the training run itself.**\n\n" "This is the **data gravity trap**: at this scale, compute must move to the data. " "Options: (1) spin up training in the same region as the data, " "(2) use a Data Lakehouse — run the training job directly on the storage node, " "or (3) upgrade to a faster network link. " "Pull the dataset size or training budget slider to find the breakeven point." ), kind="danger") else: _ratio = _transfer_hours / _training_hours if _ratio > 0.5: mo.callout(mo.md( f"**Transfer feasible but costly: {_time_label} is {_ratio*100:.0f}% of your " f"{_training_hours}-hour training budget.**\n\n" "Data is approaching the gravity threshold. A small increase in dataset size " "or reduction in training time will trigger the trap. Consider co-locating " "compute with data as a proactive architectural decision." 
), kind="warn") else: mo.callout(mo.md( f"**Transfer is viable: {_time_label} is well within the {_training_hours}-hour budget.**\n\n" "At this scale, data transfer is not the bottleneck. Data gravity has not yet " "trapped this workload. Increase the dataset size to find where the physics " "forces the architectural switch from 'move data' to 'move compute.'" ), kind="success") return # ─── ACT II: PREDICTION REVEAL ─────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, act2_prediction): # 50 TB at 100 Gbps: 50,000 GB / 12.5 GB/s = 4,000 s = 66.7 min ≈ 67 min _actual_minutes = 67 _actual_cost = 50_000 * 0.08 # 50 TB × $0.08/GB = $4,000 _predicted = { "A": 1, "B": 67, "C": 660, "D": 6624, }[act2_prediction.value] _ratio = _actual_minutes / _predicted if _predicted > 0 else float("inf") _is_correct = act2_prediction.value == "B" if _is_correct: mo.callout(mo.md( f"**Correct. You predicted ~{_predicted} minutes. The actual transfer time is ~{_actual_minutes} minutes.**\n\n" f"50 TB ÷ 12.5 GB/s = 4,000 s = **{_actual_minutes} minutes**. " f"Plus the egress cost: 50,000 GB × $0.08 = **$4,000**. " "The transfer is feasible for a 6-hour training run — but $4,000 in egress " "may exceed the cost of spinning up equivalent compute in us-east-1." ), kind="success") elif act2_prediction.value == "A": mo.callout(mo.md( f"**You were off by {_ratio:.0f}×. You predicted ~{_predicted} minute. " f"The actual transfer time is ~{_actual_minutes} minutes.**\n\n" "100 Gbps sounds fast, but it equals only 12.5 GB/s. Dividing 50 TB " "(= 50,000 GB) by 12.5 GB/s gives 4,000 seconds = **67 minutes**. " "Sustained 100 Gbps is rare in practice; real transfers are slower." ), kind="warn") elif act2_prediction.value == "C": mo.callout(mo.md( f"**You were off by {1/_ratio:.1f}×. You predicted ~{_predicted} minutes. " f"The actual transfer time is ~{_actual_minutes} minutes.**\n\n" "The calculation: 50 TB ÷ 12.5 GB/s = 4,000 s = **67 minutes**. 
" "A sustained 100 Gbps connection is fast enough to transfer 50 TB in " "just over an hour. The bottleneck becomes cost ($4,000 egress), not time." ), kind="warn") else: mo.callout(mo.md( f"**You were off by {1/_ratio:.1f}×. You predicted ~{_predicted/60:.1f} hours. " f"The actual transfer time is ~{_actual_minutes} minutes.**\n\n" "At 100 Gbps = 12.5 GB/s: 50 TB ÷ 12.5 GB/s = 4,000 s = **67 minutes**. " "The 100 Gbps link is genuinely fast. Data gravity at this scale is " "primarily an *economic* problem (egress cost) rather than a time problem." ), kind="warn") return # ─── ACT II: DECISION COMPARISON ───────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, _transfer_cost, _compute_cost_to_stay, _time_label, _ds_label): _decision = "Move compute to data" if _transfer_cost > _compute_cost_to_stay else "Move data to compute" _decision_color = "#CB202D" if _transfer_cost > _compute_cost_to_stay else "#008F45" mo.Html(f"""

Architecture Decision

Transfer data ({_ds_label}) to compute

{_time_label} + ${_transfer_cost:,.0f} egress

Spin up compute where data lives

$0 transfer + ${_compute_cost_to_stay:,.0f} compute

{_decision}

""") return # ─── ACT II: REFLECTION ────────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): mo.md(""" #### Reflection You have seen data gravity in action. Now identify the principle it demonstrates. """) return @app.cell(hide_code=True) def _(mo): act2_reflection = mo.ui.radio( options={ "A) Large datasets are heavy to store and require expensive hardware": "A", "B) Compute naturally migrates toward large datasets because transfer cost exceeds compute cost": "B", "C) Data should always be compressed before training to reduce transfer time": "C", "D) Cloud infrastructure is always faster than edge for data-intensive workloads": "D", }, label="Data gravity means:", ) act2_reflection return (act2_reflection,) @app.cell(hide_code=True) def _(mo, act2_reflection): mo.stop( act2_reflection.value is None, mo.callout(mo.md("Select your answer to continue."), kind="warn"), ) if act2_reflection.value == "B": mo.callout(mo.md( "**Correct.** Data gravity (from @sec-data-engineering-data-gravity-adcb) is " "the economic and physical pressure that pushes compute toward large datasets. " "When T_transfer = D_vol / BW exceeds the training time budget, or when egress " "cost exceeds the cost of running compute in the data's region, it becomes " "cheaper to bring the compute to the data. This explains the architecture of " "Data Lakehouses — processing engines (Spark, Presto, training jobs) run " "directly on the storage nodes where the data already resides." ), kind="success") elif act2_reflection.value == "A": mo.callout(mo.md( "**Incorrect.** Data gravity is not about storage weight or hardware cost. " "It is about *movement cost*: the time and money required to transfer data " "across a network. A 1 PB dataset sitting in one region is not a gravity " "problem — the gravity problem begins when you try to move it somewhere else. 
" "The correct answer is B: compute migrates toward data when transfer cost " "exceeds the cost of co-locating compute." ), kind="warn") elif act2_reflection.value == "C": mo.callout(mo.md( "**Incorrect.** Compression reduces the bytes to transfer, which can reduce " "transfer time (T = D_vol / BW — smaller D_vol, shorter T). But data gravity " "is not the observation that compression helps. It is the observation that " "beyond a certain scale, no amount of compression makes transfer viable — " "compute must move to the data instead. The correct answer is B." ), kind="warn") else: mo.callout(mo.md( "**Incorrect.** Cloud infrastructure is not universally faster for data workloads. " "The key insight of data gravity is that *location relative to the data* determines " "which infrastructure is faster. An edge device processing data locally avoids all " "egress costs and network latency entirely. Cloud is faster only when the compute " "is co-located with the data (same region). The correct answer is B." ), kind="warn") return # ─── ACT II: MATHPEEK ──────────────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo): mo.accordion({ "The governing equation — Data Gravity": mo.md(""" **Data Gravity** (from @sec-data-engineering-data-gravity-adcb): $$T_{transfer} = \\frac{D_{vol}}{BW}$$ Where: - **D_vol** — dataset volume in GB - **BW** — network bandwidth in GB/s - **T_transfer** — total transfer time in seconds **The architectural decision rule** (from the DataGravity notebook): > *If T_transfer > T_training → move compute to data* > *If T_transfer < T_training → move data to compute* **The economic rule** includes egress cost: $$Cost_{transfer} = D_{vol} \\times \\$0.08/\\text{GB}$$ At petabyte scale: $10^6 \\text{ GB} \\times \\$0.08 = \\$80{,}000$ egress alone. 
**Numerical example (50 TB at 100 Gbps):** ``` T_transfer = 50,000 GB / 12.5 GB/s = 4,000 s ≈ 67 minutes Cost = 50,000 GB × $0.08 = $4,000 Rule: If training takes < 67 min → the transfer takes longer than the job. Move compute to data. ``` **The rule of thumb** (from the DataGravity notebook in data_engineering.qmd): - *Petabyte scale:* Code moves to Data (Data Lakehouse, in-place compute) - *Gigabyte scale:* Data moves to Code (standard transfer is viable) """), }) return # ─── DESIGN LEDGER SAVE + HUD ───────────────────────────────────────────────── @app.cell(hide_code=True) def _(mo, ledger, context_toggle, act1_prediction, act1_reflection, act2_prediction, act2_reflection, _gpu_util, _transfer_exceeds_training, act1_storage_type, COLORS): # Save chapter results to Design Ledger _act1_correct = act1_prediction.value == "B" if act1_prediction.value else False _act1_refl_correct = act1_reflection.value == "C" if act1_reflection.value else False _act2_correct = act2_prediction.value == "B" if act2_prediction.value else False _act2_refl_correct = act2_reflection.value == "B" if act2_reflection.value else False ledger.save( chapter=4, design={ "context": context_toggle.value, "storage_type_chosen": act1_storage_type.value if act1_storage_type.value else "hdd", "gpu_util_at_start": round(_gpu_util, 1), "data_gravity_triggered": bool(_transfer_exceeds_training), "act1_correct": _act1_correct, "act1_reflection_correct": _act1_refl_correct, "act2_correct": _act2_correct, "act2_reflection_correct": _act2_refl_correct, }, ) # HUD footer _ctx_label = "Cloud — Multi-Region" if context_toggle.value == "cloud" else "Edge — Local" _act1_status = "correct" if _act1_correct else ("pending" if act1_prediction.value is None else "incorrect") _act2_status = "correct" if _act2_correct else ("pending" if act2_prediction.value is None else "incorrect") _gravity_status = "triggered" if _transfer_exceeds_training else "not triggered" def _status_color(s): return {"correct": "#4ade80", 
"pending": "#94a3b8", "incorrect": "#f87171", "triggered": "#f87171", "not triggered": "#4ade80"}.get(s, "#94a3b8") mo.Html(f"""

        
            DESIGN LEDGER · CH04
        
        |
        
            Context: 
            {_ctx_label}
        
            Act I prediction: 
            {_act1_status}
        
            Act II prediction: 
            {_act2_status}
        
            Gravity trap: 
            {_gravity_status}
        
            GPU util (initial): 
            {_gpu_util:.0f}%

""") return if __name__ == "__main__": app.run()