mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-04-30 17:48:27 -05:00
docs: clean up landing page and centralize math foundations
- Elevate 5-Layer Progressive Lowering mental model to architecture.qmd - Clean up landing page copy to be a punchy one-liner - Re-render architecture composition diagram as SVG for reliability - Move math derivations out of tutorials and into math.qmd with citations - Add DGX Spark to Silicon Zoo
This commit is contained in:
@@ -204,6 +204,30 @@ def calc_tree_allreduce_time(message_bytes, n_gpus, bandwidth_bytes_s, latency_s
|
|||||||
return (bw_term + lat_term).to(ureg.second)
|
return (bw_term + lat_term).to(ureg.second)
|
||||||
|
|
||||||
|
|
||||||
|
def calc_all_to_all_time(message_bytes, n_gpus, bandwidth_bytes_s, latency_s):
|
||||||
|
"""
|
||||||
|
All-to-All communication time estimate (typical for MoE token routing).
|
||||||
|
|
||||||
|
T = (N-1)/N × M/β + (N-1) × α
|
||||||
|
|
||||||
|
Args:
|
||||||
|
message_bytes: Total message size in bytes (M) per node
|
||||||
|
n_gpus: Number of GPUs (N)
|
||||||
|
bandwidth_bytes_s: Per-link bandwidth in bytes/second (β)
|
||||||
|
latency_s: Per-message startup latency in seconds (α)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Quantity[second]: Estimated All-to-All time
|
||||||
|
"""
|
||||||
|
msg = _ensure_unit(message_bytes, ureg.byte)
|
||||||
|
bw = _ensure_unit(bandwidth_bytes_s, ureg.byte / ureg.second)
|
||||||
|
lat = _ensure_unit(latency_s, ureg.second)
|
||||||
|
n = n_gpus
|
||||||
|
bw_term = (n - 1) / n * msg / bw
|
||||||
|
lat_term = (n - 1) * lat
|
||||||
|
return (bw_term + lat_term).to(ureg.second)
|
||||||
|
|
||||||
|
|
||||||
def calc_transformer_training_flops(n_params, n_tokens):
|
def calc_transformer_training_flops(n_params, n_tokens):
|
||||||
"""
|
"""
|
||||||
Estimate total training FLOPs for a Transformer model (6PD rule).
|
Estimate total training FLOPs for a Transformer model (6PD rule).
|
||||||
@@ -359,20 +383,21 @@ def calc_mtbf_node(gpu_mtbf_h, n_gpus, nic_mtbf_h, n_nics,
|
|||||||
return (1.0 / rate).to(ureg.hour)
|
return (1.0 / rate).to(ureg.hour)
|
||||||
|
|
||||||
|
|
||||||
def calc_pipeline_bubble(n_stages, n_microbatches):
|
def calc_pipeline_bubble(n_stages, n_microbatches, v_stages=1):
|
||||||
"""
|
"""
|
||||||
Pipeline bubble fraction (GPipe / 1F1B).
|
Pipeline bubble fraction (GPipe / 1F1B / Interleaved 1F1B).
|
||||||
|
|
||||||
bubble = (P - 1) / (P - 1 + M)
|
bubble = (P - 1) / (V * M + P - 1)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
n_stages: Number of pipeline stages (P)
|
n_stages: Number of pipeline stages (P)
|
||||||
n_microbatches: Number of microbatches (M)
|
n_microbatches: Number of microbatches (M)
|
||||||
|
v_stages: Number of virtual stages per GPU (V, default 1)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Bubble fraction (0.0 to 1.0)
|
Bubble fraction (0.0 to 1.0)
|
||||||
"""
|
"""
|
||||||
return (n_stages - 1) / (n_stages - 1 + n_microbatches)
|
return (n_stages - 1) / (v_stages * n_microbatches + n_stages - 1)
|
||||||
|
|
||||||
|
|
||||||
def calc_checkpoint_size(n_params, bytes_per_param=16):
|
def calc_checkpoint_size(n_params, bytes_per_param=16):
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from .formulas import (
|
|||||||
calc_ring_allreduce_time,
|
calc_ring_allreduce_time,
|
||||||
calc_tree_allreduce_time,
|
calc_tree_allreduce_time,
|
||||||
calc_hierarchical_allreduce_time,
|
calc_hierarchical_allreduce_time,
|
||||||
|
calc_all_to_all_time,
|
||||||
calc_mtbf_cluster,
|
calc_mtbf_cluster,
|
||||||
calc_young_daly_interval,
|
calc_young_daly_interval,
|
||||||
calc_failure_probability,
|
calc_failure_probability,
|
||||||
@@ -64,10 +65,12 @@ class DistributedSolver(BaseSolver):
|
|||||||
efficiency: float = 0.5,
|
efficiency: float = 0.5,
|
||||||
tp_size: int = 1,
|
tp_size: int = 1,
|
||||||
pp_size: int = 1,
|
pp_size: int = 1,
|
||||||
|
ep_size: int = 1,
|
||||||
|
v_stages: int = 1,
|
||||||
microbatch_count: int = 1,
|
microbatch_count: int = 1,
|
||||||
topology_override: Optional[str] = None) -> Dict[str, Any]:
|
topology_override: Optional[str] = None) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Calculates distributed training performance using the 3D Parallelism model.
|
Calculates distributed training performance using the 3D/4D Parallelism model.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
@@ -87,6 +90,11 @@ class DistributedSolver(BaseSolver):
|
|||||||
pp_size : int
|
pp_size : int
|
||||||
Pipeline Parallelism degree. Chains model layers across multiple
|
Pipeline Parallelism degree. Chains model layers across multiple
|
||||||
nodes, introducing 'pipeline bubbles' while saving memory.
|
nodes, introducing 'pipeline bubbles' while saving memory.
|
||||||
|
ep_size : int
|
||||||
|
Expert Parallelism degree for MoE models. Introduces All-to-All
|
||||||
|
communication overhead across nodes.
|
||||||
|
v_stages : int
|
||||||
|
Number of virtual stages for interleaved pipeline schedules.
|
||||||
microbatch_count : int
|
microbatch_count : int
|
||||||
Number of microbatches (M). Increasing M reduces the pipeline
|
Number of microbatches (M). Increasing M reduces the pipeline
|
||||||
bubble but increases synchronization overhead.
|
bubble but increases synchronization overhead.
|
||||||
@@ -96,15 +104,15 @@ class DistributedSolver(BaseSolver):
|
|||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
Dict[str, Any]
|
Dict[str, Any]
|
||||||
Metrics including DP/TP latency, the Pipeline Bubble penalty,
|
Metrics including DP/TP/EP latency, the Pipeline Bubble penalty,
|
||||||
and the final Scaling Efficiency.
|
and the final Scaling Efficiency.
|
||||||
"""
|
"""
|
||||||
# 1. 3D Parallelism Decomposition
|
# 1. 3D/4D Parallelism Decomposition
|
||||||
n_accelerators = fleet.total_accelerators
|
n_accelerators = fleet.total_accelerators
|
||||||
dp_size = n_accelerators // (tp_size * pp_size)
|
dp_size = n_accelerators // (tp_size * pp_size * ep_size)
|
||||||
|
|
||||||
if dp_size < 1:
|
if dp_size < 1:
|
||||||
raise ValueError(f"Infeasible 3D Parallelism: TP({tp_size}) * PP({pp_size}) > Total({n_accelerators})")
|
raise ValueError(f"Infeasible 4D Parallelism: TP({tp_size}) * PP({pp_size}) * EP({ep_size}) > Total({n_accelerators})")
|
||||||
|
|
||||||
# 2. Single Node Performance (Computation)
|
# 2. Single Node Performance (Computation)
|
||||||
node_perf = Engine.solve(model, fleet.node.accelerator, batch_size=batch_size // dp_size, precision=precision, efficiency=efficiency)
|
node_perf = Engine.solve(model, fleet.node.accelerator, batch_size=batch_size // dp_size, precision=precision, efficiency=efficiency)
|
||||||
@@ -139,13 +147,25 @@ class DistributedSolver(BaseSolver):
|
|||||||
# TP Communication (Assuming intra-node NVLink)
|
# TP Communication (Assuming intra-node NVLink)
|
||||||
t_comm_tp = (message_size / tp_size / fleet.node.intra_node_bw).to("ms") if tp_size > 1 else Q_("0 ms")
|
t_comm_tp = (message_size / tp_size / fleet.node.intra_node_bw).to("ms") if tp_size > 1 else Q_("0 ms")
|
||||||
|
|
||||||
|
# EP Communication (All-to-All token routing for MoE)
|
||||||
|
if ep_size > 1:
|
||||||
|
t_comm_ep = calc_all_to_all_time(
|
||||||
|
message_bytes=message_size,
|
||||||
|
n_gpus=ep_size,
|
||||||
|
bandwidth_bytes_s=fleet.fabric.bandwidth / fleet.fabric.oversubscription_ratio,
|
||||||
|
latency_s=fleet.fabric.latency or Q_("5 us")
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
t_comm_ep = Q_("0 ms")
|
||||||
|
|
||||||
# 4. Pipeline Parallelism (PP) Bubble
|
# 4. Pipeline Parallelism (PP) Bubble
|
||||||
# Source: Narayanan et al. (2019), "PipePipe: Efficient Pipeline Parallelism"
|
# Source: Narayanan et al. (2019), "PipePipe: Efficient Pipeline Parallelism"
|
||||||
bubble_fraction = calc_pipeline_bubble(pp_size, microbatch_count)
|
# Supports interleaved 1F1B schedules via v_stages
|
||||||
|
bubble_fraction = calc_pipeline_bubble(pp_size, microbatch_count, v_stages=v_stages)
|
||||||
t_bubble = (node_perf.latency * bubble_fraction) if pp_size > 1 else Q_("0 ms")
|
t_bubble = (node_perf.latency * bubble_fraction) if pp_size > 1 else Q_("0 ms")
|
||||||
|
|
||||||
# 5. Total Latency and Scaling Efficiency
|
# 5. Total Latency and Scaling Efficiency
|
||||||
total_comm_latency = t_comm_dp + t_comm_tp
|
total_comm_latency = t_comm_dp + t_comm_tp + t_comm_ep
|
||||||
step_latency_total = node_perf.latency + total_comm_latency + t_bubble
|
step_latency_total = node_perf.latency + total_comm_latency + t_bubble
|
||||||
|
|
||||||
scaling_efficiency = (node_perf.latency / step_latency_total).magnitude
|
scaling_efficiency = (node_perf.latency / step_latency_total).magnitude
|
||||||
@@ -154,13 +174,14 @@ class DistributedSolver(BaseSolver):
|
|||||||
"node_performance": node_perf,
|
"node_performance": node_perf,
|
||||||
"dp_communication_latency": t_comm_dp,
|
"dp_communication_latency": t_comm_dp,
|
||||||
"tp_communication_latency": t_comm_tp,
|
"tp_communication_latency": t_comm_tp,
|
||||||
|
"ep_communication_latency": t_comm_ep,
|
||||||
"communication_latency": total_comm_latency, # Backwards compatibility for tests
|
"communication_latency": total_comm_latency, # Backwards compatibility for tests
|
||||||
"pipeline_bubble_latency": t_bubble,
|
"pipeline_bubble_latency": t_bubble,
|
||||||
"bubble_fraction": bubble_fraction,
|
"bubble_fraction": bubble_fraction,
|
||||||
"step_latency_total": step_latency_total,
|
"step_latency_total": step_latency_total,
|
||||||
"scaling_efficiency": scaling_efficiency,
|
"scaling_efficiency": scaling_efficiency,
|
||||||
"effective_throughput": (n_accelerators * node_perf.throughput * scaling_efficiency),
|
"effective_throughput": (n_accelerators * node_perf.throughput * scaling_efficiency),
|
||||||
"parallelism": {"dp": dp_size, "tp": tp_size, "pp": pp_size}
|
"parallelism": {"dp": dp_size, "tp": tp_size, "pp": pp_size, "ep": ep_size}
|
||||||
}
|
}
|
||||||
|
|
||||||
class ReliabilitySolver(BaseSolver):
|
class ReliabilitySolver(BaseSolver):
|
||||||
|
|||||||
@@ -1,12 +1,8 @@
|
|||||||
---
|
---
|
||||||
title: "Page Not Found"
|
title: "Page Not Found"
|
||||||
sidebar: false
|
sidebar: false
|
||||||
format:
|
page-layout: custom
|
||||||
html:
|
|
||||||
page-layout: custom
|
|
||||||
toc: false
|
|
||||||
---
|
---
|
||||||
|
|
||||||
<div style="min-height: 60vh; display: flex; flex-direction: column; align-items: center; justify-content: center; text-align: center; padding: 4rem 2rem;">
|
<div style="min-height: 60vh; display: flex; flex-direction: column; align-items: center; justify-content: center; text-align: center; padding: 4rem 2rem;">
|
||||||
|
|
||||||
<div style="font-size: 5rem; font-weight: 900; color: #E2E8F0; letter-spacing: -0.04em; line-height: 1; margin-bottom: 1.5rem;">404</div>
|
<div style="font-size: 5rem; font-weight: 900; color: #E2E8F0; letter-spacing: -0.04em; line-height: 1; margin-bottom: 1.5rem;">404</div>
|
||||||
|
|||||||
@@ -103,45 +103,56 @@ website:
|
|||||||
search: true
|
search: true
|
||||||
collapse-level: 1
|
collapse-level: 1
|
||||||
contents:
|
contents:
|
||||||
- getting-started.qmd
|
- section: "Welcome"
|
||||||
- solver-guide.qmd
|
|
||||||
- "---"
|
|
||||||
|
|
||||||
- section: "Tutorials"
|
|
||||||
contents:
|
contents:
|
||||||
|
- getting-started.qmd
|
||||||
|
- for-students.qmd
|
||||||
|
- for-instructors.qmd
|
||||||
|
- for-engineers.qmd
|
||||||
|
|
||||||
|
- section: "Using MLSYSIM"
|
||||||
|
contents:
|
||||||
|
- solver-guide.qmd
|
||||||
- tutorials/hello_world.qmd
|
- tutorials/hello_world.qmd
|
||||||
|
- tutorials/sustainability.qmd
|
||||||
- tutorials/llm_serving.qmd
|
- tutorials/llm_serving.qmd
|
||||||
- tutorials/distributed.qmd
|
- tutorials/distributed.qmd
|
||||||
- tutorials/sustainability.qmd
|
|
||||||
|
|
||||||
- section: "Catalogs"
|
- section: "The MLSys Zoo"
|
||||||
href: zoo/index.qmd
|
href: zoo/index.qmd
|
||||||
contents:
|
contents:
|
||||||
- zoo/hardware.qmd
|
- zoo/hardware.qmd
|
||||||
- zoo/models.qmd
|
- zoo/models.qmd
|
||||||
- zoo/fleets.qmd
|
- zoo/fleets.qmd
|
||||||
- zoo/infra.qmd
|
- zoo/infra.qmd
|
||||||
- "---"
|
|
||||||
|
|
||||||
- math.qmd
|
- section: "Foundations"
|
||||||
- glossary.qmd
|
contents:
|
||||||
- accuracy.qmd
|
- architecture.qmd
|
||||||
- "---"
|
- math.qmd
|
||||||
|
- glossary.qmd
|
||||||
|
- accuracy.qmd
|
||||||
|
|
||||||
- text: "Whitepaper"
|
- section: "About"
|
||||||
href: whitepaper.qmd
|
contents:
|
||||||
- contributing.qmd
|
- whitepaper.qmd
|
||||||
- "---"
|
- contributing.qmd
|
||||||
|
|
||||||
- section: "API"
|
- section: "API Reference"
|
||||||
href: api/index.qmd
|
href: api/index.qmd
|
||||||
contents:
|
contents:
|
||||||
- api/hardware.qmd
|
- text: "Hardware"
|
||||||
- api/models.qmd
|
href: api/hardware.qmd
|
||||||
- api/systems.qmd
|
- text: "Models"
|
||||||
- api/infra.qmd
|
href: api/models.qmd
|
||||||
- api/core.qmd
|
- text: "Systems"
|
||||||
- api/core.solver.qmd
|
href: api/systems.qmd
|
||||||
|
- text: "Infrastructure"
|
||||||
|
href: api/infra.qmd
|
||||||
|
- text: "Core"
|
||||||
|
href: api/core.qmd
|
||||||
|
- text: "Solvers"
|
||||||
|
href: api/core.solver.qmd
|
||||||
|
|
||||||
# Footer — ecosystem pattern (matches Kits)
|
# Footer — ecosystem pattern (matches Kits)
|
||||||
page-footer:
|
page-footer:
|
||||||
@@ -169,7 +180,7 @@ format:
|
|||||||
respect-user-color-scheme: true
|
respect-user-color-scheme: true
|
||||||
css: styles/landing.css
|
css: styles/landing.css
|
||||||
toc: true
|
toc: true
|
||||||
toc-depth: 3
|
toc-depth: 4
|
||||||
toc-title: "On this page"
|
toc-title: "On this page"
|
||||||
number-sections: false
|
number-sections: false
|
||||||
code-copy: true
|
code-copy: true
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Model Accuracy & Validation"
|
title: "Model Accuracy & Validation"
|
||||||
subtitle: "How well do MLSYSIM predictions match measured hardware performance?"
|
subtitle: "How well do MLSYSIM predictions match measured hardware performance?"
|
||||||
---
|
---
|
||||||
|
|
||||||
MLSYSIM is a **first-order analytical model** — it predicts performance from analytical equations,
|
MLSYSIM is a **first-order analytical model** — it predicts performance from analytical equations,
|
||||||
not from empirical measurements. This page documents where those predictions are accurate,
|
not from empirical measurements. This page documents where those predictions are accurate,
|
||||||
where they diverge, and why.
|
where they diverge, and why.
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "hardware"
|
title: "hardware"
|
||||||
subtitle: "Hardware specifications and device registry"
|
subtitle: "Hardware specifications and device registry"
|
||||||
---
|
---
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import mlsysim
|
import mlsysim
|
||||||
from mlsysim.hardware.types import ComputeCore, MemoryHierarchy, HardwareNode
|
from mlsysim.hardware.types import ComputeCore, MemoryHierarchy, HardwareNode
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "API Reference"
|
title: "API Reference"
|
||||||
subtitle: "The 5-Layer MLSYSIM Stack: from Silicon to Sustainability"
|
subtitle: "The 5-Layer MLSYSIM Stack: from Silicon to Sustainability"
|
||||||
---
|
---
|
||||||
|
|
||||||
MLSYSIM is a pedagogical simulation platform for reasoning about ML systems trade-offs across the full stack. Every number is unit-typed via [Pint](https://pint.readthedocs.io), every specification is sourced from vendor datasheets, and every solver implements a closed-form analytical model -- no black-box benchmarks.
|
MLSYSIM is a pedagogical simulation platform for reasoning about ML systems trade-offs across the full stack. Every number is unit-typed via [Pint](https://pint.readthedocs.io), every specification is sourced from vendor datasheets, and every solver implements a closed-form analytical model -- no black-box benchmarks.
|
||||||
|
|
||||||
## Architecture
|
## Architecture
|
||||||
|
|||||||
88
mlsysim/docs/architecture.qmd
Normal file
88
mlsysim/docs/architecture.qmd
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
---
|
||||||
|
title: "The 5-Layer Architecture"
|
||||||
|
subtitle: "The Mental Model of Progressive Lowering"
|
||||||
|
---
|
||||||
|
The core philosophy of MLSYSIM is **Progressive Lowering**. Rather than treating machine learning systems as black boxes, MLSYSIM organizes the domain into five composable layers.
|
||||||
|
|
||||||
|
Abstract workload *demand* (Layer A) is progressively mapped onto concrete hardware *supply* (Layers B, C, D) through analytical *solvers* (Layer E). Understanding this stack is the key to mastering both this library and the textbook it accompanies.
|
||||||
|
|
||||||
|
## The Stack Diagram
|
||||||
|
|
||||||
|
```{mermaid}
|
||||||
|
%%{init: {'theme': 'neutral'}}%%
|
||||||
|
%%| fig-cap: "The MLSYSIM 5-Layer Stack. Workloads (demand) are lowered onto Hardware (supply) through Infrastructure and Systems layers. Solvers bridge demand and supply to produce analytical profiles."
|
||||||
|
%%| fig-width: 100%
|
||||||
|
flowchart TB
|
||||||
|
A["<b>Layer A: Workloads (Demand)</b><br/>TransformerWorkload, CNNWorkload<br/><i>Parameters, FLOPs, Arithmetic Intensity</i>"]
|
||||||
|
B["<b>Layer B: Hardware (Silicon)</b><br/>HardwareNode, ComputeCore, MemoryHierarchy<br/><i>Peak FLOP/s, Bandwidth, Capacity, TDP</i>"]
|
||||||
|
C["<b>Layer C: Infrastructure (Environment)</b><br/>GridProfile, Datacenter<br/><i>Carbon Intensity, PUE, WUE</i>"]
|
||||||
|
D["<b>Layer D: Systems (Topology)</b><br/>Node, Fleet, NetworkFabric<br/><i>Topology, Accelerators/Node, Fabric BW</i>"]
|
||||||
|
E["<b>Layer E: Solvers (Analysis)</b><br/>SingleNode · Distributed · Serving<br/>Economics · Sustainability · Reliability"]
|
||||||
|
F["<b>Results</b><br/>PerformanceProfile"]
|
||||||
|
|
||||||
|
A --> E
|
||||||
|
B --> D
|
||||||
|
C --> D
|
||||||
|
D --> E
|
||||||
|
E --> F
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Layer A: Workloads (Demand)
|
||||||
|
|
||||||
|
A **Workload** is a hardware-agnostic description of computational demand. You don't ask "How fast is Llama-3?", you ask "How many FLOPs and memory bytes does Llama-3 require?"
|
||||||
|
|
||||||
|
In MLSYSIM, `TransformerWorkload` and `CNNWorkload` define these intrinsic properties (parameter count, layer count, sequence length). The crucial step happens when a workload is "lowered" at a specific numerical precision (e.g., FP16 vs INT8). This lowering step determines the **Arithmetic Intensity** (ops/byte) — the ratio that decides whether a model will be compute-bound or memory-bound on physical hardware.
|
||||||
|
|
||||||
|
*See the [Model Zoo](zoo/models.qmd) for vetted workloads.*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Layer B: Hardware (Supply)
|
||||||
|
|
||||||
|
A **`HardwareNode`** represents a single physical accelerator (like an H100 GPU or an Apple M3 chip). It provides the raw physical supply:
|
||||||
|
|
||||||
|
* **Compute:** Theoretical peak throughput (TFLOP/s) across different precisions (FP32, FP16, INT8).
|
||||||
|
* **Memory:** High Bandwidth Memory (HBM) capacity and transfer speed (TB/s).
|
||||||
|
* **Power:** Thermal Design Power (TDP).
|
||||||
|
|
||||||
|
Every piece of silicon has a "Ridge Point" (Peak FLOPs / Memory Bandwidth). If your Workload's arithmetic intensity is lower than the hardware's ridge point, you are memory-bound.
|
||||||
|
|
||||||
|
*See the [Silicon Zoo](zoo/hardware.qmd) for vetted hardware specs.*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Layer C: Infrastructure (Environment)
|
||||||
|
|
||||||
|
Hardware doesn't run in a vacuum; it runs in datacenters plugged into regional power grids. The **`GridProfile`** captures this physical context.
|
||||||
|
|
||||||
|
A 1000-watt GPU running in Quebec (hydroelectric power) vs. Poland (coal power) produces vastly different carbon footprints, despite doing the exact same mathematical operations. This layer introduces Power Usage Effectiveness (PUE) and Carbon Intensity to the analytical model.
|
||||||
|
|
||||||
|
*See the [Infrastructure Zoo](zoo/infra.qmd) for regional grid profiles.*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Layer D: Systems (Topology)
|
||||||
|
|
||||||
|
You cannot train a 100-Billion parameter model on a single GPU. A **`Fleet`** composes individual `HardwareNode`s into a distributed cluster.
|
||||||
|
|
||||||
|
* **`Node`:** Groups accelerators within a physical server chassis (e.g., 8x GPUs).
|
||||||
|
* **`NetworkFabric`:** Specifies how servers talk to each other (e.g., 400 Gbps InfiniBand NDR).
|
||||||
|
|
||||||
|
The way you structure this system determines your communication overhead and your scaling efficiency when you apply 3D/4D Parallelism.
|
||||||
|
|
||||||
|
*See the [Fleet Zoo](zoo/fleets.qmd) for production cluster topologies.*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Layer E: Solvers (Analysis)
|
||||||
|
|
||||||
|
The previous four layers are just static definitions (nouns). **Solvers** are the engines (verbs) that bridge demand and supply to answer specific questions.
|
||||||
|
|
||||||
|
Each solver implements closed-form equations from classic systems literature:
|
||||||
|
* **`SingleNodeSolver`**: Maps Layer A to Layer B using the Roofline model.
|
||||||
|
* **`DistributedSolver`**: Maps Layer A to Layer D using Ring All-Reduce and Pipeline schedules.
|
||||||
|
* **`SustainabilitySolver`**: Maps Layer D to Layer C using energy physics.
|
||||||
|
|
||||||
|
*See the [Solver Guide](solver-guide.qmd) to learn how to apply these engines.*
|
||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Contributing to MLSYSIM"
|
title: "Contributing to MLSYSIM"
|
||||||
subtitle: "How to add hardware specs, write tutorials, and grow the MLSys Zoo."
|
subtitle: "How to add hardware specs, write tutorials, and grow the MLSys Zoo."
|
||||||
---
|
---
|
||||||
|
|
||||||
MLSYSIM grows stronger with every new hardware spec, tutorial, and bug report. This guide
|
MLSYSIM grows stronger with every new hardware spec, tutorial, and bug report. This guide
|
||||||
explains how to contribute — whether you are a student who found a discrepancy in a spec,
|
explains how to contribute — whether you are a student who found a discrepancy in a spec,
|
||||||
an instructor who wants to share a teaching scenario, or a practitioner who wants a new
|
an instructor who wants to share a teaching scenario, or a practitioner who wants a new
|
||||||
|
|||||||
154
mlsysim/docs/for-engineers.qmd
Normal file
154
mlsysim/docs/for-engineers.qmd
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
---
|
||||||
|
title: "For Engineers & Researchers"
|
||||||
|
subtitle: "Back-of-envelope estimates before you provision hardware."
|
||||||
|
---
|
||||||
|
MLSYSIM gives you quick, type-safe analytical estimates for capacity planning, hardware selection, cost modeling, and sustainability analysis — in seconds, from specifications alone.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Why Use Analytical Models?
|
||||||
|
|
||||||
|
Before running expensive benchmarks or provisioning cloud instances, you need directional answers:
|
||||||
|
|
||||||
|
- **Will this model fit in GPU memory?** — Check before renting the GPU
|
||||||
|
- **What's the expected TTFT for my LLM?** — Estimate before building the serving stack
|
||||||
|
- **How many H100s do I actually need?** — Model scaling efficiency before buying the cluster
|
||||||
|
- **What will this cost per year?** — TCO analysis before signing the contract
|
||||||
|
|
||||||
|
MLSYSIM answers these in microseconds using first-order equations. It won't replace profiling, but it tells you *where to start profiling*.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick API Usage
|
||||||
|
|
||||||
|
```python
|
||||||
|
import mlsysim
|
||||||
|
from mlsysim import Engine, ServingSolver, DistributedSolver
|
||||||
|
|
||||||
|
# Single-node: Is ResNet-50 memory-bound on A100?
|
||||||
|
profile = Engine.solve(
|
||||||
|
model=mlsysim.Models.ResNet50,
|
||||||
|
hardware=mlsysim.Hardware.Cloud.A100,
|
||||||
|
batch_size=1, precision="fp16"
|
||||||
|
)
|
||||||
|
print(f"{profile.bottleneck}, {profile.latency.to('ms'):~.2f}")
|
||||||
|
|
||||||
|
# LLM serving: What's the TTFT for Llama-3.1-70B on H100?
|
||||||
|
serving = ServingSolver()
|
||||||
|
result = serving.solve(
|
||||||
|
model=mlsysim.Models.Language.Llama3_70B,
|
||||||
|
hardware=mlsysim.Hardware.Cloud.H100,
|
||||||
|
seq_len=4096, batch_size=1
|
||||||
|
)
|
||||||
|
print(f"TTFT: {result['ttft'].to('ms'):~.1f}")
|
||||||
|
print(f"ITL: {result['itl'].to('ms'):~.2f}")
|
||||||
|
print(f"KV-cache: {result['kv_cache_size'].to('GB'):~.1f}")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Hardware Sweep Pattern
|
||||||
|
|
||||||
|
Compare devices programmatically instead of reading datasheets:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import mlsysim
|
||||||
|
from mlsysim import Engine
|
||||||
|
|
||||||
|
model = mlsysim.Models.ResNet50
|
||||||
|
|
||||||
|
for hw in [mlsysim.Hardware.Cloud.H100,
|
||||||
|
mlsysim.Hardware.Cloud.A100,
|
||||||
|
mlsysim.Hardware.Cloud.T4,
|
||||||
|
mlsysim.Hardware.Edge.JetsonAGX]:
|
||||||
|
p = Engine.solve(model=model, hardware=hw, batch_size=32, precision="fp16")
|
||||||
|
print(f"{hw.name:20s} {p.bottleneck:16s} {p.latency.to('ms'):>8.2f~} {p.throughput:>8.0f} img/s")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Composing Solvers for Real Questions
|
||||||
|
|
||||||
|
The six solvers are designed to chain:
|
||||||
|
|
||||||
|
### "Can I serve Llama-70B on 4 H100s within budget?"
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mlsysim import ServingSolver, EconomicsSolver
|
||||||
|
|
||||||
|
# Step 1: Does it fit and what's the latency?
|
||||||
|
serving = ServingSolver()
|
||||||
|
result = serving.solve(
|
||||||
|
model=mlsysim.Models.Language.Llama3_70B,
|
||||||
|
hardware=mlsysim.Hardware.Cloud.H100,
|
||||||
|
seq_len=4096, batch_size=1
|
||||||
|
)
|
||||||
|
|
||||||
|
# Step 2: What does that fleet cost?
|
||||||
|
econ = EconomicsSolver()
|
||||||
|
cost = econ.solve(
|
||||||
|
fleet=mlsysim.Systems.Clusters.H100_8,
|
||||||
|
duration_days=365,
|
||||||
|
kwh_price=0.08
|
||||||
|
)
|
||||||
|
print(f"Annual TCO: ${cost['total_tco'].magnitude:,.0f}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### "Where should I train to minimize carbon?"
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mlsysim import SustainabilitySolver
|
||||||
|
|
||||||
|
sustain = SustainabilitySolver()
|
||||||
|
for grid in [mlsysim.Infra.Grids.Quebec, mlsysim.Infra.Grids.US_Average,
|
||||||
|
mlsysim.Infra.Grids.Poland]:
|
||||||
|
r = sustain.solve(
|
||||||
|
fleet=mlsysim.Systems.Clusters.H100_256,
|
||||||
|
duration_days=30,
|
||||||
|
datacenter=grid
|
||||||
|
)
|
||||||
|
print(f"{grid.name:12s} {r['carbon_kg'].to('metric_ton'):>8.1f~}")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Writing Custom Solvers
|
||||||
|
|
||||||
|
Follow the built-in solver pattern to create your own analysis:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mlsysim.hardware.types import HardwareNode
|
||||||
|
|
||||||
|
class PowerEfficiencySolver:
|
||||||
|
def solve(self, hardware: HardwareNode) -> dict:
|
||||||
|
flops_per_watt = hardware.compute.peak_flops / hardware.tdp
|
||||||
|
return {
|
||||||
|
"device": hardware.name,
|
||||||
|
"flops_per_watt": flops_per_watt.to("TFLOPs/s/kW"),
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
See [Extending MLSYSIM](solver-guide.qmd#extending-mlsysim) for the full guide.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Type Safety
|
||||||
|
|
||||||
|
All quantities are `pint.Quantity` objects. Unit conversions are explicit, and dimensional errors are caught at runtime:
|
||||||
|
|
||||||
|
```python
|
||||||
|
hw = mlsysim.Hardware.Cloud.A100
|
||||||
|
hw.compute.peak_flops.to("TFLOPs/s") # → 312.0 TFLOPs/s
|
||||||
|
hw.memory.bandwidth.to("TB/s") # → 2.0 TB/s
|
||||||
|
hw.memory.bandwidth.to("FLOP/s") # → DimensionalityError ✓
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
- **[Getting Started](getting-started.qmd)** — Install and run your first analysis
|
||||||
|
- **[Solver Guide](solver-guide.qmd)** — Which solver for which question
|
||||||
|
- **[MLSys Zoo](zoo/index.qmd)** — Browse all available hardware, model, and infrastructure specs
|
||||||
|
- **[API Reference](api/index.qmd)** — Full programmatic API documentation
|
||||||
|
- **[Accuracy & Validation](accuracy.qmd)** — How analytical bounds compare to empirical measurements
|
||||||
96
mlsysim/docs/for-instructors.qmd
Normal file
96
mlsysim/docs/for-instructors.qmd
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
---
|
||||||
|
title: "For Instructors"
|
||||||
|
subtitle: "Reproducible, hardware-independent exercises for ML systems courses."
|
||||||
|
---
|
||||||
|
MLSYSIM provides a framework for assigning analytically grounded problem sets where every answer is deterministic and reproducible — regardless of what hardware your students have access to.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Why MLSYSIM for Teaching?
|
||||||
|
|
||||||
|
| Challenge | How MLSYSIM Helps |
|
||||||
|
|:----------|:------------------|
|
||||||
|
| Students lack GPU access | All analysis runs on a laptop — no cloud credits needed |
|
||||||
|
| Homework answers vary by hardware | Vetted registry specs produce identical results everywhere |
|
||||||
|
| Hard to grade open-ended systems questions | Analytical solvers give deterministic, verifiable outputs |
|
||||||
|
| Specifications become stale | Registry updated from official datasheets; one update propagates everywhere |
|
||||||
|
| Students memorize without understanding | "Predict first" exercises build genuine intuition |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Course Integration Patterns
|
||||||
|
|
||||||
|
### Pattern 1: Textbook Companion
|
||||||
|
|
||||||
|
MLSYSIM maps directly to chapters in the [Machine Learning Systems](https://mlsysbook.ai) textbook. Assign tutorials alongside readings:
|
||||||
|
|
||||||
|
| Week | Textbook Chapter | MLSYSIM Assignment |
|
||||||
|
|:-----|:-----------------|:-------------------|
|
||||||
|
| 3 | Hardware Acceleration | [Hello World](tutorials/hello_world.qmd) — Roofline analysis, batch size sweep |
|
||||||
|
| 5 | Model Serving | [LLM Serving](tutorials/llm_serving.qmd) — TTFT/ITL analysis |
|
||||||
|
| 7 | Distributed Training | [Distributed Training](tutorials/distributed.qmd) — 3D parallelism |
|
||||||
|
| 9 | Sustainable AI | [Sustainability Lab](tutorials/sustainability.qmd) — Carbon footprint |
|
||||||
|
| 11 | Compute Infrastructure | [Solver Guide](solver-guide.qmd) — Composing solvers for TCO analysis |
|
||||||
|
|
||||||
|
### Pattern 2: Standalone Labs
|
||||||
|
|
||||||
|
Use individual tutorials as self-contained lab assignments in any systems course. Each tutorial includes exercises with clear expected outputs.
|
||||||
|
|
||||||
|
### Pattern 3: Capstone Projects
|
||||||
|
|
||||||
|
Advanced students can write custom solvers (see [Extending MLSYSIM](solver-guide.qmd#extending-mlsysim)) or compose multiple solvers to answer research-style questions.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Assignment Ideas
|
||||||
|
|
||||||
|
### Homework: Hardware Comparison (30 min)
|
||||||
|
> Using `Engine.solve()`, compare ResNet-50 inference latency on the A100, H100, and Jetson AGX at batch sizes 1, 32, and 256. For each configuration, state whether the workload is memory-bound or compute-bound and explain why the bottleneck changes.
|
||||||
|
|
||||||
|
### Lab: Carbon-Aware Training (45 min)
|
||||||
|
> Using the SustainabilitySolver, calculate the carbon footprint of training GPT-3 on a 256-GPU H100 cluster in Quebec vs. US Average vs. Poland. Produce a table and a 2-paragraph analysis of why location matters.
|
||||||
|
|
||||||
|
### Exam Question: Back-of-Envelope
|
||||||
|
> The NVIDIA H100 has 1,979 TFLOP/s (FP16) and 3.35 TB/s bandwidth. What is the ridge point in FLOP/Byte? If a model has arithmetic intensity of 50 FLOP/Byte, is it compute-bound or memory-bound? Show your work.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Reproducibility Guarantee
|
||||||
|
|
||||||
|
All specifications in the [MLSys Zoo](zoo/index.qmd) are:
|
||||||
|
|
||||||
|
- **Sourced** from official manufacturer datasheets and published benchmarks
|
||||||
|
- **Typed** with `pint.Quantity` for dimensional correctness
|
||||||
|
- **Frozen** per release — `mlsysim==0.1.0` always produces the same answers
|
||||||
|
|
||||||
|
This means your answer key works for every student, every semester.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Jupyter & Quarto Compatibility
|
||||||
|
|
||||||
|
All tutorials are designed to run in:
|
||||||
|
|
||||||
|
- **Jupyter Notebooks** — Standard `.ipynb` workflow
|
||||||
|
- **Quarto documents** — Render to HTML, PDF, or slides with `quarto render`
|
||||||
|
- **Google Colab** — `pip install mlsysim` in the first cell, then go
|
||||||
|
|
||||||
|
No GPU runtime required. CPU-only environments work perfectly because MLSYSIM computes from equations, not empirical profiling.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Getting Started
|
||||||
|
|
||||||
|
1. Point students to the [Getting Started](getting-started.qmd) guide for installation
|
||||||
|
2. Assign the [Hello World](tutorials/hello_world.qmd) tutorial as a warmup
|
||||||
|
3. Use the [Solver Guide](solver-guide.qmd) to select solvers for your course topics
|
||||||
|
4. Browse the [MLSys Zoo](zoo/index.qmd) for available hardware and model specifications
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
- **[Solver Guide](solver-guide.qmd)** — Which solver maps to which topic
|
||||||
|
- **[Math Foundations](math.qmd)** — All equations, for your own reference and exam prep
|
||||||
|
- **[Accuracy & Validation](accuracy.qmd)** — How close are analytical estimates to empirical results?
|
||||||
|
- **[Whitepaper](whitepaper.qmd)** — The academic paper describing MLSYSIM's design and pedagogy
|
||||||
93
mlsysim/docs/for-students.qmd
Normal file
93
mlsysim/docs/for-students.qmd
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
---
|
||||||
|
title: "For Students"
|
||||||
|
subtitle: "Build intuition for ML systems — without needing GPU hardware."
|
||||||
|
---
|
||||||
|
Whether you're taking your first ML systems course or preparing for industry interviews, MLSYSIM lets you experiment with real hardware specifications and see exactly *why* systems behave the way they do.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## What You'll Learn
|
||||||
|
|
||||||
|
By working through the MLSYSIM tutorials and exercises, you will:
|
||||||
|
|
||||||
|
- **Identify bottlenecks** — Determine whether a workload is memory-bound or compute-bound on any hardware, and understand *why*
|
||||||
|
- **Reason quantitatively** — Use real datasheet numbers (not made-up examples) to calculate latency, throughput, and cost
|
||||||
|
- **Build systems intuition** — See how batch size, precision, parallelism strategy, and datacenter location each affect performance
|
||||||
|
- **Think across the stack** — Connect workload characteristics to hardware specs to infrastructure constraints
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Your Learning Path
|
||||||
|
|
||||||
|
Start at the top and work through in order. Each tutorial builds on the one before it.
|
||||||
|
|
||||||
|
| Step | Tutorial | You'll Learn | Time |
|
||||||
|
|:-----|:---------|:-------------|:-----|
|
||||||
|
| 1 | [Hello World](tutorials/hello_world.qmd) | The roofline model, memory-bound vs. compute-bound, batch size sweeps | 15 min |
|
||||||
|
| 2 | [Sustainability Lab](tutorials/sustainability.qmd) | Energy, carbon footprint, regional grid effects | 20 min |
|
||||||
|
| 3 | [LLM Serving](tutorials/llm_serving.qmd) | TTFT vs. ITL, KV-cache pressure, the two phases of LLM inference | 25 min |
|
||||||
|
| 4 | [Distributed Training](tutorials/distributed.qmd) | Data/tensor/pipeline parallelism, communication overhead, scaling efficiency | 30 min |
|
||||||
|
|
||||||
|
::: {.callout-tip}
|
||||||
|
## Predict Before You Compute
|
||||||
|
Every tutorial includes "predict first" exercises. Before running code, write down what you expect. This practice builds the mental models that make you effective at systems reasoning.
|
||||||
|
:::
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## How MLSYSIM Pairs with the Textbook
|
||||||
|
|
||||||
|
MLSYSIM is the companion framework for the [Machine Learning Systems](https://mlsysbook.ai) textbook. Each solver maps to specific chapters:
|
||||||
|
|
||||||
|
| Textbook Topic | MLSYSIM Solver | What It Models |
|
||||||
|
|:---------------|:---------------|:---------------|
|
||||||
|
| Hardware Acceleration | SingleNodeSolver | Roofline analysis, compute vs. memory bottleneck |
|
||||||
|
| Model Serving | ServingSolver | TTFT, ITL, KV-cache memory |
|
||||||
|
| Distributed Training | DistributedSolver | 3D parallelism, all-reduce, pipeline bubbles |
|
||||||
|
| Compute Infrastructure | EconomicsSolver | CapEx, OpEx, TCO |
|
||||||
|
| Sustainable AI | SustainabilitySolver | Energy, carbon, water usage |
|
||||||
|
| Fault Tolerance | ReliabilitySolver | MTBF, checkpoint interval |
|
||||||
|
|
||||||
|
Not using the textbook? No problem — MLSYSIM is self-contained. The [Math Foundations](math.qmd) page documents every equation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- **Python**: Comfortable with functions, loops, and f-strings
|
||||||
|
- **Math**: Basic algebra (no calculus required — all solver equations are arithmetic)
|
||||||
|
- **ML**: Familiarity with terms like "model parameters," "inference," and "training" (the [Glossary](glossary.qmd) defines everything else)
|
||||||
|
|
||||||
|
No GPU, no cloud account, no special hardware required. Just:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install mlsysim
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```python
|
||||||
|
import mlsysim
|
||||||
|
from mlsysim import Engine
|
||||||
|
|
||||||
|
# Load a model and hardware from the vetted registry
|
||||||
|
model = mlsysim.Models.ResNet50
|
||||||
|
gpu = mlsysim.Hardware.Cloud.A100
|
||||||
|
|
||||||
|
# Solve: is this workload memory-bound or compute-bound?
|
||||||
|
profile = Engine.solve(model=model, hardware=gpu, batch_size=1, precision="fp16")
|
||||||
|
|
||||||
|
print(f"Bottleneck: {profile.bottleneck}") # → Memory Bound
|
||||||
|
print(f"Latency: {profile.latency.to('ms'):~.2f}")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
- **[Getting Started](getting-started.qmd)** — Install MLSYSIM and run your first analysis
|
||||||
|
- **[Hello World Tutorial](tutorials/hello_world.qmd)** — Your first roofline analysis
|
||||||
|
- **[Glossary](glossary.qmd)** — Look up any unfamiliar term
|
||||||
|
- **[Math Foundations](math.qmd)** — The equations behind every solver
|
||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Getting Started"
|
title: "Getting Started"
|
||||||
subtitle: "Install MLSYSIM and run your first analysis in under 5 minutes."
|
subtitle: "Install MLSYSIM and run your first analysis in under 5 minutes."
|
||||||
---
|
---
|
||||||
|
|
||||||
::: {.callout-note}
|
::: {.callout-note}
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
MLSYSIM assumes basic Python familiarity (variables, functions, `pip install`). No prior ML or hardware knowledge is required. Key concepts like **roofline analysis**, **memory-bound vs. compute-bound**, and **FLOP/s** are explained in context throughout the tutorials. For a full reference of terms, see the [Glossary](glossary.qmd).
|
MLSYSIM assumes basic Python familiarity (variables, functions, `pip install`). No prior ML or hardware knowledge is required. Key concepts like **roofline analysis**, **memory-bound vs. compute-bound**, and **FLOP/s** are explained in context throughout the tutorials. For a full reference of terms, see the [Glossary](glossary.qmd).
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Glossary"
|
title: "Glossary"
|
||||||
subtitle: "Definitions for every term used in the MLSYSIM documentation."
|
subtitle: "Definitions for every term used in the MLSYSIM documentation."
|
||||||
---
|
---
|
||||||
|
|
||||||
This page defines every technical term used across the MLSYSIM documentation.
|
This page defines every technical term used across the MLSYSIM documentation.
|
||||||
When a term is first used on any page, it either links here or is defined inline.
|
When a term is first used on any page, it either links here or is defined inline.
|
||||||
|
|
||||||
|
|||||||
@@ -2,86 +2,7 @@
|
|||||||
title: "MLSYSIM"
|
title: "MLSYSIM"
|
||||||
page-layout: custom
|
page-layout: custom
|
||||||
sidebar: false
|
sidebar: false
|
||||||
format:
|
|
||||||
html:
|
|
||||||
toc: false
|
|
||||||
include-in-header:
|
|
||||||
text: |
|
|
||||||
<style>
|
|
||||||
.quarto-title, .quarto-title-meta, h1.title, .breadcrumb { display: none !important; }
|
|
||||||
#title-block-header { display: none !important; }
|
|
||||||
</style>
|
|
||||||
<script>
|
|
||||||
function copyInstall() {
|
|
||||||
navigator.clipboard.writeText('pip install mlsysim').then(function() {
|
|
||||||
var btn = document.getElementById('copy-btn');
|
|
||||||
var orig = btn.textContent;
|
|
||||||
btn.textContent = 'Copied!';
|
|
||||||
setTimeout(function() { btn.textContent = orig; }, 2000);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Count-up animation for stats
|
|
||||||
document.addEventListener('DOMContentLoaded', function() {
|
|
||||||
var observer = new IntersectionObserver(function(entries) {
|
|
||||||
entries.forEach(function(entry) {
|
|
||||||
if (entry.isIntersecting) {
|
|
||||||
var nums = entry.target.querySelectorAll('.im-stat-num');
|
|
||||||
nums.forEach(function(el) {
|
|
||||||
var text = el.textContent.trim();
|
|
||||||
var suffix = text.replace(/[0-9]/g, '');
|
|
||||||
var target = parseInt(text);
|
|
||||||
if (isNaN(target)) return;
|
|
||||||
var duration = 1200;
|
|
||||||
var start = performance.now();
|
|
||||||
el.textContent = '0' + suffix;
|
|
||||||
function step(now) {
|
|
||||||
var progress = Math.min((now - start) / duration, 1);
|
|
||||||
var eased = 1 - Math.pow(1 - progress, 3);
|
|
||||||
el.textContent = Math.round(target * eased) + suffix;
|
|
||||||
if (progress < 1) requestAnimationFrame(step);
|
|
||||||
}
|
|
||||||
requestAnimationFrame(step);
|
|
||||||
});
|
|
||||||
observer.unobserve(entry.target);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}, { threshold: 0.5 });
|
|
||||||
|
|
||||||
var stats = document.querySelector('.im-stats');
|
|
||||||
if (stats) observer.observe(stats);
|
|
||||||
|
|
||||||
// Carousel
|
|
||||||
var slides = document.querySelectorAll('.im-slide');
|
|
||||||
var dots = document.querySelectorAll('.im-dot');
|
|
||||||
var current = 0;
|
|
||||||
var timer;
|
|
||||||
|
|
||||||
function showSlide(n) {
|
|
||||||
slides.forEach(function(s) { s.classList.remove('im-slide-active'); });
|
|
||||||
dots.forEach(function(d) { d.classList.remove('im-dot-active'); });
|
|
||||||
current = n;
|
|
||||||
slides[current].classList.add('im-slide-active');
|
|
||||||
dots[current].classList.add('im-dot-active');
|
|
||||||
}
|
|
||||||
|
|
||||||
function nextSlide() { showSlide((current + 1) % slides.length); }
|
|
||||||
|
|
||||||
function startTimer() { timer = setInterval(nextSlide, 5000); }
|
|
||||||
|
|
||||||
dots.forEach(function(dot) {
|
|
||||||
dot.addEventListener('click', function() {
|
|
||||||
clearInterval(timer);
|
|
||||||
showSlide(parseInt(this.dataset.slide));
|
|
||||||
startTimer();
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
if (slides.length > 0) startTimer();
|
|
||||||
});
|
|
||||||
</script>
|
|
||||||
---
|
---
|
||||||
|
|
||||||
<!-- ============================================================
|
<!-- ============================================================
|
||||||
HERO (one cohesive dark section)
|
HERO (one cohesive dark section)
|
||||||
============================================================ -->
|
============================================================ -->
|
||||||
@@ -98,11 +19,18 @@ MLSYSIM
|
|||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-subtitle}
|
::: {.im-subtitle}
|
||||||
Predict ML system performance, cost, and carbon from first principles.
|
Predict ML system performance, cost, and carbon.<br/>From first principles.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
|
<div class="im-stats">
|
||||||
|
<div class="im-stat"><span class="im-stat-num">Fundamental</span><span class="im-stat-label">Physics Solvers</span></div>
|
||||||
|
<div class="im-stat"><span class="im-stat-num">18+</span><span class="im-stat-label">Vetted Hardware Specs</span></div>
|
||||||
|
<div class="im-stat"><span class="im-stat-num">13+</span><span class="im-stat-label">Reference Workloads</span></div>
|
||||||
|
<div class="im-stat"><span class="im-stat-num">4</span><span class="im-stat-label">Carbon-Aware Regions</span></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
::: {.im-hero-desc}
|
::: {.im-hero-desc}
|
||||||
Analytical solvers for reasoning about ML workloads, from microcontrollers to thousand-GPU clusters, without provisioning any hardware.
|
Reason about ML workloads—from microcontrollers to GPU clusters—without provisioning any hardware.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-install}
|
::: {.im-install}
|
||||||
@@ -124,10 +52,12 @@ Analytical solvers for reasoning about ML workloads, from microcontrollers to th
|
|||||||
<div class="im-hero-inner">
|
<div class="im-hero-inner">
|
||||||
<div class="im-carousel">
|
<div class="im-carousel">
|
||||||
<div class="im-carousel-track">
|
<div class="im-carousel-track">
|
||||||
|
<button class="im-arrow im-arrow-prev" aria-label="Previous slide">‹</button>
|
||||||
|
<button class="im-arrow im-arrow-next" aria-label="Next slide">›</button>
|
||||||
<div class="im-slide im-slide-active" data-index="0">
|
<div class="im-slide im-slide-active" data-index="0">
|
||||||
<div class="im-slide-label">Roofline Analysis</div>
|
<div class="im-slide-label">Roofline Analysis</div>
|
||||||
<div class="im-slide-viz">
|
<div class="im-slide-viz">
|
||||||
<svg viewBox="0 0 320 120" class="im-roofline-svg">
|
<svg viewBox="0 0 320 130" class="im-roofline-svg">
|
||||||
<line x1="40" y1="100" x2="300" y2="100" stroke="rgba(148,163,184,0.3)" stroke-width="1"/>
|
<line x1="40" y1="100" x2="300" y2="100" stroke="rgba(148,163,184,0.3)" stroke-width="1"/>
|
||||||
<line x1="40" y1="20" x2="40" y2="100" stroke="rgba(148,163,184,0.3)" stroke-width="1"/>
|
<line x1="40" y1="20" x2="40" y2="100" stroke="rgba(148,163,184,0.3)" stroke-width="1"/>
|
||||||
<text x="170" y="115" fill="#64748b" font-size="9" text-anchor="middle">Arithmetic Intensity (FLOP/Byte)</text>
|
<text x="170" y="115" fill="#64748b" font-size="9" text-anchor="middle">Arithmetic Intensity (FLOP/Byte)</text>
|
||||||
@@ -147,18 +77,21 @@ Analytical solvers for reasoning about ML workloads, from microcontrollers to th
|
|||||||
<div class="im-slide" data-index="1">
|
<div class="im-slide" data-index="1">
|
||||||
<div class="im-slide-label">Hardware Comparison</div>
|
<div class="im-slide-label">Hardware Comparison</div>
|
||||||
<div class="im-slide-viz">
|
<div class="im-slide-viz">
|
||||||
<svg viewBox="0 0 320 120" class="im-bars-svg">
|
<svg viewBox="0 0 320 130" class="im-bars-svg">
|
||||||
<text x="50" y="22" fill="#94a3b8" font-size="9" text-anchor="end">H100</text>
|
<text x="50" y="22" fill="#94a3b8" font-size="9" text-anchor="end">H100</text>
|
||||||
<rect x="55" y="12" width="0" height="14" rx="3" fill="#38bdf8"><animate attributeName="width" from="0" to="150" dur="1.5s" fill="freeze" begin="0s"/></rect>
|
<rect x="55" y="12" width="0" height="14" rx="3" fill="#38bdf8"><animate attributeName="width" from="0" to="200" dur="1.5s" fill="freeze" begin="0s"/></rect>
|
||||||
<text x="210" y="23" fill="#94a3b8" font-size="8">990 TFLOP/s</text>
|
<text x="260" y="23" fill="#94a3b8" font-size="8">990 TFLOP/s</text>
|
||||||
|
|
||||||
<text x="50" y="47" fill="#94a3b8" font-size="9" text-anchor="end">A100</text>
|
<text x="50" y="47" fill="#94a3b8" font-size="9" text-anchor="end">A100</text>
|
||||||
<rect x="55" y="37" width="0" height="14" rx="3" fill="#38bdf8" opacity="0.7"><animate attributeName="width" from="0" to="95" dur="1.5s" fill="freeze" begin="0.1s"/></rect>
|
<rect x="55" y="37" width="0" height="14" rx="3" fill="#38bdf8" opacity="0.7"><animate attributeName="width" from="0" to="120" dur="1.5s" fill="freeze" begin="0.1s"/></rect>
|
||||||
<text x="155" y="48" fill="#94a3b8" font-size="8">312 TFLOP/s</text>
|
<text x="180" y="48" fill="#94a3b8" font-size="8">312 TFLOP/s</text>
|
||||||
|
|
||||||
<text x="50" y="72" fill="#94a3b8" font-size="9" text-anchor="end">Jetson</text>
|
<text x="50" y="72" fill="#94a3b8" font-size="9" text-anchor="end">Jetson</text>
|
||||||
<rect x="55" y="62" width="0" height="14" rx="3" fill="#38bdf8" opacity="0.4"><animate attributeName="width" from="0" to="8" dur="1.5s" fill="freeze" begin="0.2s"/></rect>
|
<rect x="55" y="62" width="0" height="14" rx="3" fill="#38bdf8" opacity="0.4"><animate attributeName="width" from="0" to="15" dur="1.5s" fill="freeze" begin="0.2s"/></rect>
|
||||||
<text x="68" y="73" fill="#94a3b8" font-size="8">25 TFLOP/s</text>
|
<text x="75" y="73" fill="#94a3b8" font-size="8">25 TFLOP/s</text>
|
||||||
|
|
||||||
<text x="50" y="97" fill="#94a3b8" font-size="9" text-anchor="end">ESP32</text>
|
<text x="50" y="97" fill="#94a3b8" font-size="9" text-anchor="end">ESP32</text>
|
||||||
<rect x="55" y="87" width="0" height="14" rx="3" fill="#38bdf8" opacity="0.2"><animate attributeName="width" from="0" to="1" dur="1.5s" fill="freeze" begin="0.3s"/></rect>
|
<rect x="55" y="87" width="0" height="14" rx="3" fill="#38bdf8" opacity="0.2"><animate attributeName="width" from="0" to="2" dur="1.5s" fill="freeze" begin="0.3s"/></rect>
|
||||||
<text x="62" y="98" fill="#94a3b8" font-size="8">0.5 GFLOP/s</text>
|
<text x="62" y="98" fill="#94a3b8" font-size="8">0.5 GFLOP/s</text>
|
||||||
</svg>
|
</svg>
|
||||||
</div>
|
</div>
|
||||||
@@ -167,19 +100,28 @@ Analytical solvers for reasoning about ML workloads, from microcontrollers to th
|
|||||||
<div class="im-slide" data-index="2">
|
<div class="im-slide" data-index="2">
|
||||||
<div class="im-slide-label">Sustainability Analysis</div>
|
<div class="im-slide-label">Sustainability Analysis</div>
|
||||||
<div class="im-slide-viz">
|
<div class="im-slide-viz">
|
||||||
<svg viewBox="0 0 320 120" class="im-sustain-svg">
|
<svg viewBox="0 0 320 130" class="im-sustain-svg">
|
||||||
|
<line x1="85" y1="10" x2="85" y2="110" stroke="rgba(148,163,184,0.1)" stroke-width="1"/>
|
||||||
<text x="80" y="22" fill="#94a3b8" font-size="9" text-anchor="end">Quebec</text>
|
<text x="80" y="22" fill="#94a3b8" font-size="9" text-anchor="end">Quebec</text>
|
||||||
<rect x="85" y="12" width="0" height="14" rx="3" fill="#10b981"><animate attributeName="width" from="0" to="12" dur="1.5s" fill="freeze"/></rect>
|
<rect x="85" y="12" width="0" height="14" rx="3" fill="#10b981">
|
||||||
<text x="102" y="23" fill="#94a3b8" font-size="8">24 g CO₂/kWh</text>
|
<animate attributeName="width" from="0" to="10" dur="1.5s" fill="freeze" begin="0s"/>
|
||||||
|
</rect>
|
||||||
|
<text x="100" y="23" fill="#94a3b8" font-size="8">20 g CO₂/kWh</text>
|
||||||
<text x="80" y="47" fill="#94a3b8" font-size="9" text-anchor="end">Norway</text>
|
<text x="80" y="47" fill="#94a3b8" font-size="9" text-anchor="end">Norway</text>
|
||||||
<rect x="85" y="37" width="0" height="14" rx="3" fill="#10b981" opacity="0.8"><animate attributeName="width" from="0" to="16" dur="1.5s" fill="freeze" begin="0.1s"/></rect>
|
<rect x="85" y="37" width="0" height="14" rx="3" fill="#10b981" opacity="0.8">
|
||||||
<text x="106" y="48" fill="#94a3b8" font-size="8">29 g CO₂/kWh</text>
|
<animate attributeName="width" from="0" to="5" dur="1.5s" fill="freeze" begin="0.1s"/>
|
||||||
|
</rect>
|
||||||
|
<text x="95" y="48" fill="#94a3b8" font-size="8">10 g CO₂/kWh</text>
|
||||||
<text x="80" y="72" fill="#94a3b8" font-size="9" text-anchor="end">US Avg</text>
|
<text x="80" y="72" fill="#94a3b8" font-size="9" text-anchor="end">US Avg</text>
|
||||||
<rect x="85" y="62" width="0" height="14" rx="3" fill="#f59e0b"><animate attributeName="width" from="0" to="110" dur="1.5s" fill="freeze" begin="0.2s"/></rect>
|
<rect x="85" y="62" width="0" height="14" rx="3" fill="#f59e0b">
|
||||||
<text x="200" y="73" fill="#94a3b8" font-size="8">390 g CO₂/kWh</text>
|
<animate attributeName="width" from="0" to="95" dur="1.5s" fill="freeze" begin="0.2s"/>
|
||||||
|
</rect>
|
||||||
|
<text x="185" y="73" fill="#94a3b8" font-size="8">390 g CO₂/kWh</text>
|
||||||
<text x="80" y="97" fill="#94a3b8" font-size="9" text-anchor="end">Poland</text>
|
<text x="80" y="97" fill="#94a3b8" font-size="9" text-anchor="end">Poland</text>
|
||||||
<rect x="85" y="87" width="0" height="14" rx="3" fill="#ef4444"><animate attributeName="width" from="0" to="200" dur="1.5s" fill="freeze" begin="0.3s"/></rect>
|
<rect x="85" y="87" width="0" height="14" rx="3" fill="#ef4444">
|
||||||
<text x="290" y="98" fill="#94a3b8" font-size="8">700+ g CO₂/kWh</text>
|
<animate attributeName="width" from="0" to="200" dur="1.5s" fill="freeze" begin="0.3s"/>
|
||||||
|
</rect>
|
||||||
|
<text x="290" y="98" fill="#94a3b8" font-size="8">820 g CO₂/kWh</text>
|
||||||
</svg>
|
</svg>
|
||||||
</div>
|
</div>
|
||||||
<div class="im-slide-caption">Same workload, different region. Up to 41x difference in carbon footprint.</div>
|
<div class="im-slide-caption">Same workload, different region. Up to 82x difference in carbon footprint.</div>
|
||||||
@@ -187,37 +129,87 @@ Analytical solvers for reasoning about ML workloads, from microcontrollers to th
|
|||||||
<div class="im-slide" data-index="3">
|
<div class="im-slide" data-index="3">
|
||||||
<div class="im-slide-label">LLM Serving</div>
|
<div class="im-slide-label">LLM Serving</div>
|
||||||
<div class="im-slide-viz">
|
<div class="im-slide-viz">
|
||||||
<svg viewBox="0 0 320 120" class="im-serving-svg">
|
<svg viewBox="0 0 320 130" class="im-serving-svg">
|
||||||
<text x="160" y="15" fill="#94a3b8" font-size="9" text-anchor="middle">Llama-3.1-8B on H100</text>
|
<text x="160" y="15" fill="#94a3b8" font-size="9" text-anchor="middle">Llama-3.1-8B on H100</text>
|
||||||
<rect x="30" y="30" width="120" height="50" rx="6" fill="rgba(56,189,248,0.1)" stroke="rgba(56,189,248,0.3)" stroke-width="1"/>
|
<rect x="30" y="30" width="120" height="50" rx="6" fill="rgba(56,189,248,0.1)" stroke="rgba(56,189,248,0.3)" stroke-width="1"/>
|
||||||
<text x="90" y="48" fill="#38bdf8" font-size="9" font-weight="bold" text-anchor="middle">Pre-fill</text>
|
<text x="90" y="46" fill="#38bdf8" font-size="9" font-weight="bold" text-anchor="middle">Pre-fill</text>
|
||||||
<text x="90" y="62" fill="#7dd3fc" font-size="18" font-weight="bold" text-anchor="middle">4.2 ms</text>
|
<text x="90" y="64" fill="#7dd3fc" font-size="18" font-weight="bold" text-anchor="middle">4.2 ms</text>
|
||||||
<text x="90" y="74" fill="#64748b" font-size="7" text-anchor="middle">TTFT (compute-bound)</text>
|
<text x="90" y="76" fill="#64748b" font-size="7" text-anchor="middle">TTFT (compute-bound)</text>
|
||||||
<text x="160" y="58" fill="#94a3b8" font-size="14">→</text>
|
<text x="160" y="60" fill="#94a3b8" font-size="14">→</text>
|
||||||
<rect x="180" y="30" width="120" height="50" rx="6" fill="rgba(16,185,129,0.1)" stroke="rgba(16,185,129,0.3)" stroke-width="1"/>
|
<rect x="180" y="30" width="120" height="50" rx="6" fill="rgba(16,185,129,0.1)" stroke="rgba(16,185,129,0.3)" stroke-width="1"/>
|
||||||
<text x="240" y="48" fill="#10b981" font-size="9" font-weight="bold" text-anchor="middle">Decode</text>
|
<text x="240" y="46" fill="#10b981" font-size="9" font-weight="bold" text-anchor="middle">Decode</text>
|
||||||
<text x="240" y="62" fill="#6ee7b7" font-size="18" font-weight="bold" text-anchor="middle">0.8 ms</text>
|
<text x="240" y="64" fill="#6ee7b7" font-size="18" font-weight="bold" text-anchor="middle">0.8 ms</text>
|
||||||
<text x="240" y="74" fill="#64748b" font-size="7" text-anchor="middle">ITL (memory-bound)</text>
|
<text x="240" y="76" fill="#64748b" font-size="7" text-anchor="middle">ITL (memory-bound)</text>
|
||||||
<rect x="70" y="90" width="180" height="22" rx="4" fill="rgba(245,158,11,0.1)" stroke="rgba(245,158,11,0.3)" stroke-width="1"/>
|
<rect x="70" y="96" width="180" height="22" rx="4" fill="rgba(245,158,11,0.1)" stroke="rgba(245,158,11,0.3)" stroke-width="1"/>
|
||||||
<text x="160" y="105" fill="#f59e0b" font-size="8" text-anchor="middle">KV-Cache: 2.1 GB / 80 GB available</text>
|
<text x="160" y="111" fill="#f59e0b" font-size="8" text-anchor="middle">KV-Cache: 2.1 GB / 80 GB available</text>
|
||||||
</svg>
|
</svg>
|
||||||
</div>
|
</div>
|
||||||
<div class="im-slide-caption">Model the two phases of autoregressive inference and KV-cache memory pressure.</div>
|
<div class="im-slide-caption">Model the two phases of autoregressive inference and KV-cache memory pressure.</div>
|
||||||
</div>
|
</div>
|
||||||
|
<div class="im-slide" data-index="4">
|
||||||
|
<div class="im-slide-label">Distributed Training</div>
|
||||||
|
<div class="im-slide-viz">
|
||||||
|
<svg viewBox="0 0 320 130" class="im-distributed-svg">
|
||||||
|
<text x="160" y="15" fill="#94a3b8" font-size="9" text-anchor="middle">256× H100 — GPT-3 175B</text>
|
||||||
|
<!-- Parallelism strategy boxes -->
|
||||||
|
<rect x="10" y="28" width="95" height="42" rx="6" fill="rgba(124,58,237,0.1)" stroke="rgba(124,58,237,0.3)" stroke-width="1"/>
|
||||||
|
<text x="57.5" y="45" fill="#a78bfa" font-size="8" font-weight="bold" text-anchor="middle">Data Parallel</text>
|
||||||
|
<text x="57.5" y="62" fill="#c4b5fd" font-size="16" font-weight="bold" text-anchor="middle">32×</text>
|
||||||
|
|
||||||
|
<rect x="112.5" y="28" width="95" height="42" rx="6" fill="rgba(56,189,248,0.1)" stroke="rgba(56,189,248,0.3)" stroke-width="1"/>
|
||||||
|
<text x="160" y="45" fill="#38bdf8" font-size="8" font-weight="bold" text-anchor="middle">Tensor Parallel</text>
|
||||||
|
<text x="160" y="62" fill="#7dd3fc" font-size="16" font-weight="bold" text-anchor="middle">4×</text>
|
||||||
|
|
||||||
|
<rect x="215" y="28" width="95" height="42" rx="6" fill="rgba(16,185,129,0.1)" stroke="rgba(16,185,129,0.3)" stroke-width="1"/>
|
||||||
|
<text x="262.5" y="45" fill="#10b981" font-size="8" font-weight="bold" text-anchor="middle">Pipeline Parallel</text>
|
||||||
|
<text x="262.5" y="62" fill="#6ee7b7" font-size="16" font-weight="bold" text-anchor="middle">2×</text>
|
||||||
|
|
||||||
|
<!-- Results row -->
|
||||||
|
<line x1="20" y1="82" x2="300" y2="82" stroke="rgba(148,163,184,0.15)" stroke-width="1"/>
|
||||||
|
<text x="85" y="98" fill="#94a3b8" font-size="8" text-anchor="middle">Scaling Efficiency</text>
|
||||||
|
<text x="85" y="118" fill="#a78bfa" font-size="18" font-weight="bold" text-anchor="middle">74%</text>
|
||||||
|
<text x="235" y="98" fill="#94a3b8" font-size="8" text-anchor="middle">Pipeline Bubble</text>
|
||||||
|
<text x="235" y="114" fill="#f59e0b" font-size="18" font-weight="bold" text-anchor="middle">6.3%</text>
|
||||||
|
</svg>
|
||||||
|
</div>
|
||||||
|
<div class="im-slide-caption">3D parallelism decomposition: data, tensor, and pipeline parallel scaling on GPU clusters.</div>
|
||||||
|
</div>
|
||||||
|
<div class="im-slide" data-index="5">
|
||||||
|
<div class="im-slide-label">Total Cost of Ownership</div>
|
||||||
|
<div class="im-slide-viz">
|
||||||
|
<svg viewBox="0 0 320 130" class="im-tco-svg">
|
||||||
|
<text x="160" y="15" fill="#94a3b8" font-size="9" text-anchor="middle">64× H100 Cluster — 3-Year TCO</text>
|
||||||
|
<!-- Stacked cost bars -->
|
||||||
|
<text x="50" y="42" fill="#94a3b8" font-size="9" text-anchor="end">CapEx</text>
|
||||||
|
<rect x="55" y="30" width="0" height="16" rx="3" fill="#38bdf8"><animate attributeName="width" from="0" to="200" dur="1.5s" fill="freeze"/></rect>
|
||||||
|
<text x="260" y="42" fill="#94a3b8" font-size="8">$2.0M</text>
|
||||||
|
|
||||||
|
<text x="50" y="68" fill="#94a3b8" font-size="9" text-anchor="end">Energy</text>
|
||||||
|
<rect x="55" y="56" width="0" height="16" rx="3" fill="#f59e0b"><animate attributeName="width" from="0" to="120" dur="1.5s" fill="freeze" begin="0.1s"/></rect>
|
||||||
|
<text x="180" y="68" fill="#94a3b8" font-size="8">$1.2M</text>
|
||||||
|
|
||||||
|
<text x="50" y="94" fill="#94a3b8" font-size="9" text-anchor="end">Maint.</text>
|
||||||
|
<rect x="55" y="82" width="0" height="16" rx="3" fill="#10b981"><animate attributeName="width" from="0" to="50" dur="1.5s" fill="freeze" begin="0.2s"/></rect>
|
||||||
|
<text x="110" y="94" fill="#94a3b8" font-size="8">$0.5M</text>
|
||||||
|
|
||||||
|
<!-- Total -->
|
||||||
|
<line x1="55" y1="108" x2="260" y2="108" stroke="rgba(148,163,184,0.2)" stroke-width="1"/>
|
||||||
|
<text x="55" y="124" fill="#94a3b8" font-size="9">Total TCO</text>
|
||||||
|
<text x="260" y="124" fill="#e2e8f0" font-size="14" font-weight="bold" text-anchor="end">$3.7M</text>
|
||||||
|
</svg>
|
||||||
|
</div>
|
||||||
|
<div class="im-slide-caption">Break down hardware, energy, and maintenance costs over any time horizon.</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="im-carousel-dots">
|
<div class="im-carousel-dots">
|
||||||
<button class="im-dot im-dot-active" data-slide="0" aria-label="Roofline Analysis"></button>
|
<button class="im-dot im-dot-active" data-slide="0" aria-label="Roofline Analysis"></button>
|
||||||
<button class="im-dot" data-slide="1" aria-label="Hardware Comparison"></button>
|
<button class="im-dot" data-slide="1" aria-label="Hardware Comparison"></button>
|
||||||
<button class="im-dot" data-slide="2" aria-label="Sustainability"></button>
|
<button class="im-dot" data-slide="2" aria-label="Sustainability"></button>
|
||||||
<button class="im-dot" data-slide="3" aria-label="LLM Serving"></button>
|
<button class="im-dot" data-slide="3" aria-label="LLM Serving"></button>
|
||||||
|
<button class="im-dot" data-slide="4" aria-label="Distributed Training"></button>
|
||||||
|
<button class="im-dot" data-slide="5" aria-label="Total Cost of Ownership"></button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="im-stats">
|
|
||||||
<div class="im-stat"><span class="im-stat-num">6</span><span class="im-stat-label">Analytical Solvers</span></div>
|
|
||||||
<div class="im-stat"><span class="im-stat-num">18+</span><span class="im-stat-label">Hardware Devices</span></div>
|
|
||||||
<div class="im-stat"><span class="im-stat-num">13+</span><span class="im-stat-label">ML Workloads</span></div>
|
|
||||||
<div class="im-stat"><span class="im-stat-num">4</span><span class="im-stat-label">Grid Regions</span></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
```
|
```
|
||||||
@@ -252,7 +244,7 @@ print(f"Latency: {profile.latency.to('ms'):~.2f}") # → 0.34 ms
|
|||||||
print(f"Throughput: {profile.throughput:.0f} img/s") # → 2941 img/s
|
print(f"Throughput: {profile.throughput:.0f} img/s") # → 2941 img/s
|
||||||
```
|
```
|
||||||
|
|
||||||
At batch=1, ResNet-50 loads ~50 MB of weights but performs only ~8 GFLOPs, making it firmly memory-bound on any modern GPU. The solver identifies this in microseconds using the **Iron Law**:
|
At batch=1, ResNet-50 loads ~50 MB of weights but performs only ~8 GFLOPs, making it firmly memory-bound on any modern GPU. The solver identifies this in microseconds using the **Iron Law** [@williams2009roofline]:
|
||||||
|
|
||||||
$$T = \max\!\left(\frac{\text{FLOPs}}{\text{Peak} \times \eta},\ \frac{\text{Bytes}}{\text{BW}}\right)$$
|
$$T = \max\!\left(\frac{\text{FLOPs}}{\text{Peak} \times \eta},\ \frac{\text{Bytes}}{\text{BW}}\right)$$
|
||||||
|
|
||||||
@@ -272,42 +264,42 @@ Every solver takes typed registry objects and returns analytically grounded esti
|
|||||||
::: {.im-solver-card .im-solver-roofline}
|
::: {.im-solver-card .im-solver-roofline}
|
||||||
::: {.im-solver-icon}
|
::: {.im-solver-icon}
|
||||||
:::
|
:::
|
||||||
**Roofline Analysis**\
|
**Roofline Analysis**
|
||||||
Compute vs. memory bottleneck identification using the Iron Law. Single-node latency and throughput.
|
Compute vs. memory bottleneck identification using the Iron Law. Single-node latency and throughput.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-solver-card .im-solver-distributed}
|
::: {.im-solver-card .im-solver-distributed}
|
||||||
::: {.im-solver-icon}
|
::: {.im-solver-icon}
|
||||||
:::
|
:::
|
||||||
**3D Parallelism**\
|
**3D Parallelism**
|
||||||
Data, tensor, and pipeline parallel scaling efficiency. Ring all-reduce and pipeline bubble overhead.
|
Data, tensor, and pipeline parallel scaling efficiency. Ring all-reduce and pipeline bubble overhead.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-solver-card .im-solver-serving}
|
::: {.im-solver-card .im-solver-serving}
|
||||||
::: {.im-solver-icon}
|
::: {.im-solver-icon}
|
||||||
:::
|
:::
|
||||||
**LLM Serving**\
|
**LLM Serving**
|
||||||
Time-to-first-token (TTFT), inter-token latency (ITL), and KV-cache memory pressure.
|
Time-to-first-token (TTFT), inter-token latency (ITL), and KV-cache memory pressure.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-solver-card .im-solver-tco}
|
::: {.im-solver-card .im-solver-tco}
|
||||||
::: {.im-solver-icon}
|
::: {.im-solver-icon}
|
||||||
:::
|
:::
|
||||||
**Total Cost of Ownership**\
|
**Total Cost of Ownership**
|
||||||
CapEx, OpEx, electricity, maintenance, and per-query economics over any time horizon.
|
CapEx, OpEx, electricity, maintenance, and per-query economics over any time horizon.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-solver-card .im-solver-sustain}
|
::: {.im-solver-card .im-solver-sustain}
|
||||||
::: {.im-solver-icon}
|
::: {.im-solver-icon}
|
||||||
:::
|
:::
|
||||||
**Sustainability**\
|
**Sustainability**
|
||||||
Energy, carbon footprint (kg CO₂e), and water usage across datacenter regions.
|
Energy, carbon footprint (kg CO₂e), and water usage across datacenter regions.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-solver-card .im-solver-reliability}
|
::: {.im-solver-card .im-solver-reliability}
|
||||||
::: {.im-solver-icon}
|
::: {.im-solver-icon}
|
||||||
:::
|
:::
|
||||||
**Reliability**\
|
**Reliability**
|
||||||
Fleet MTBF, failure probability, and Young-Daly optimal checkpoint interval.
|
Fleet MTBF, failure probability, and Young-Daly optimal checkpoint interval.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
@@ -368,21 +360,21 @@ Same model, same GPU, yet up to 41x difference in carbon footprint depending on
|
|||||||
::: {.im-audience}
|
::: {.im-audience}
|
||||||
|
|
||||||
::: {.im-audience-item .im-aud-student}
|
::: {.im-audience-item .im-aud-student}
|
||||||
**Students**
|
[**Students**](for-students.qmd)
|
||||||
|
|
||||||
Build intuition for *why* ML systems behave as they do. Run roofline analysis, see the memory wall, compute carbon footprints, all without needing GPU hardware. Pairs chapter-by-chapter with the textbook.
|
Build intuition for *why* ML systems behave as they do. Run roofline analysis, see the memory wall, compute carbon footprints — all without needing GPU hardware. [See learning path →](for-students.qmd)
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-audience-item .im-aud-instructor}
|
::: {.im-audience-item .im-aud-instructor}
|
||||||
**Instructors**
|
[**Instructors**](for-instructors.qmd)
|
||||||
|
|
||||||
Assign analytically grounded problem sets with deterministic, reproducible outputs. All specs sourced from vetted datasheets. Works in Jupyter and Quarto notebooks.
|
Assign analytically grounded problem sets with deterministic, reproducible outputs. All specs sourced from vetted datasheets. [See course integration →](for-instructors.qmd)
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-audience-item .im-aud-engineer}
|
::: {.im-audience-item .im-aud-engineer}
|
||||||
**Engineers & Researchers**
|
[**Engineers & Researchers**](for-engineers.qmd)
|
||||||
|
|
||||||
Pre-deployment estimates for any architecture. Model distributed overheads, LLM serving latency, and multi-region sustainability before provisioning hardware.
|
Pre-deployment estimates for any architecture. Model distributed overheads, LLM serving latency, and multi-region sustainability before provisioning hardware. [See quick API guide →](for-engineers.qmd)
|
||||||
:::
|
:::
|
||||||
|
|
||||||
:::
|
:::
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Mathematical Foundations"
|
title: "Mathematical Foundations"
|
||||||
subtitle: "The First-Principles Equations Behind Every MLSYSIM Solver"
|
subtitle: "The First-Principles Equations Behind Every MLSYSIM Solver"
|
||||||
---
|
---
|
||||||
|
|
||||||
MLSYSIM avoids "black box" heuristics. Every output traces back to one of the equations below.
|
MLSYSIM avoids "black box" heuristics. Every output traces back to one of the equations below.
|
||||||
Before diving into code, read this page to understand *what* the solvers are computing and *why*.
|
Before diving into code, read this page to understand *what* the solvers are computing and *why*.
|
||||||
|
|
||||||
@@ -18,9 +17,12 @@ Click any solver name to go directly to its API documentation.
|
|||||||
|
|
||||||
*Implemented in [`mlsysim.core.solver.SingleNodeSolver`](api/core.solver.SingleNodeSolver.qmd).*
|
*Implemented in [`mlsysim.core.solver.SingleNodeSolver`](api/core.solver.SingleNodeSolver.qmd).*
|
||||||
|
|
||||||
**The physical intuition**: Hardware has two speed limits—how fast it can compute, and how fast it can
|
::: {.callout-note appearance="simple" icon=false}
|
||||||
move data from memory to the compute units. Your actual throughput is determined by whichever limit
|
**💡 Intuition: The Roofline Bottleneck**
|
||||||
you hit first. This is why we take the *maximum* of two terms, not their sum.
|
Hardware has two speed limits—how fast it can compute, and how fast it can move data from memory to the compute units. Your actual throughput is determined by whichever limit you hit first. This is why we take the *maximum* of two terms, not their sum.
|
||||||
|
|
||||||
|
**📚 Source:** @williams2009roofline
|
||||||
|
:::
|
||||||
|
|
||||||
$$
|
$$
|
||||||
T = \max \left( \frac{\text{FLOPs}}{\text{Peak\_FLOPs} \times \eta},\ \frac{\text{Bytes}}{\text{Memory\_BW}} \right) + \text{Dispatch\_Tax}
|
T = \max \left( \frac{\text{FLOPs}}{\text{Peak\_FLOPs} \times \eta},\ \frac{\text{Bytes}}{\text{Memory\_BW}} \right) + \text{Dispatch\_Tax}
|
||||||
@@ -96,25 +98,45 @@ at low batch sizes. Upgrading from 100 Gb Ethernet to InfiniBand NDR (400 Gb/s)
|
|||||||
|
|
||||||
### 2.3 Pipeline Parallelism Bubble
|
### 2.3 Pipeline Parallelism Bubble
|
||||||
|
|
||||||
**Pipeline parallelism** splits a model's layers across multiple stages (nodes). Stage 1
|
**Pipeline parallelism** splits a model's layers across multiple stages (nodes). Stage 1 processes layers 1–20, stage 2 processes layers 21–40, and so on. This allows models too large for a single GPU to be trained across multiple nodes.
|
||||||
processes layers 1–20, stage 2 processes layers 21–40, and so on. This allows models too large
|
|
||||||
for a single GPU to be trained across multiple nodes.
|
|
||||||
|
|
||||||
The cost is a **pipeline bubble**: at the start of each batch, downstream stages sit idle
|
::: {.callout-note appearance="simple" icon=false}
|
||||||
while waiting for upstream stages to produce output. When a pipeline of depth $P$ processes
|
**💡 Intuition: Shrinking the Pipeline Bubble**
|
||||||
$M$ microbatches, the fraction of time spent idle is:
|
In standard 1F1B pipeline parallelism, GPUs sit idle waiting for microbatches to traverse the network. You can't change the speed of light, but you *can* change the software schedule. By assigning multiple "virtual stages" ($V$) to a single GPU, we interleave the execution. While a GPU is waiting for the next microbatch of its *first* virtual stage, it can compute a microbatch for its *second* virtual stage, effectively hiding the network latency behind useful compute.
|
||||||
|
|
||||||
|
**📚 Source:** @narayanan2021efficient
|
||||||
|
:::
|
||||||
|
|
||||||
|
The cost of pipelining is a **pipeline bubble**: at the start of each batch, downstream stages sit idle while waiting for upstream stages to produce output. When a pipeline of depth $P$ processes $M$ microbatches with $V$ virtual stages per GPU, the fraction of time spent idle is:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
\text{Bubble Fraction} = \frac{P - 1}{P - 1 + M}
|
\text{Bubble Fraction} = \frac{P - 1}{V \times M + P - 1}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
The intuition: with $P$ stages and $M$ microbatches, the pipeline takes $P - 1 + M$ time
|
The intuition: with $P$ stages, the pipeline spends $P - 1$ steps filling and draining, during which not all stages are active. The solution is to either increase $M$ (more microbatches) or increase $V$ (interleaved schedules). Both make the startup and drain phases a smaller fraction of total time.
|
||||||
steps to complete, but only $M$ of those steps have all stages active. The solution is to
|
|
||||||
increase $M$ — more microbatches mean the startup and drain phases become a smaller fraction
|
|
||||||
of total time.
|
|
||||||
|
|
||||||
**Implication**: To keep the bubble below 5%, you need $M \geq 19 \cdot (P-1)$ microbatches.
|
**Implication**: To keep the bubble below 5% using standard 1F1B ($V=1$), you need $M \geq 19 \cdot (P-1)$ microbatches. With a 4-stage pipeline ($P=4$), you need at least 57 microbatches. By using $V=2$ virtual stages, you cut the required microbatches in half.
|
||||||
With a 4-stage pipeline (P=4), you need at least 57 microbatches to achieve 95% efficiency.
|
|
||||||
|
### 2.4 Expert Parallelism (Mixture of Experts)
|
||||||
|
|
||||||
|
::: {.callout-note appearance="simple" icon=false}
|
||||||
|
**💡 Intuition: Breaking the Iron Law**
|
||||||
|
Standard dense Transformers obey a strict "Iron Law": if you double the parameters, you double the memory *and* the compute FLOPs. Mixture of Experts (MoE) breaks this law. It routes tokens only to specific "expert" subnetworks. This means your **Memory Bound** is dictated by the massive *Total Parameters*, but your **Compute Bound** is dictated only by the much smaller *Active Parameters*. The physical tradeoff is a massive network bandwidth tax (All-to-All communication) to route tokens to the right experts across the cluster.
|
||||||
|
|
||||||
|
**📚 Source:** @shazeer2017outrageously
|
||||||
|
:::
|
||||||
|
|
||||||
|
To model MoE, we move from 3D to **4D Parallelism**:
|
||||||
|
|
||||||
|
$$
|
||||||
|
\text{Data Parallelism} = \frac{\text{Total GPUs}}{TP \times PP \times EP}
|
||||||
|
$$
|
||||||
|
|
||||||
|
Where $EP$ is Expert Parallelism. If $EP > 1$, the solver adds an All-to-All communication penalty for token routing:
|
||||||
|
|
||||||
|
$$
|
||||||
|
T_{\text{all-to-all}} = \frac{N-1}{N} \times \frac{\text{Message Size}}{\text{Bandwidth}} + (N-1) \times \text{Latency}
|
||||||
|
$$
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -198,6 +220,27 @@ Where:
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## 6. Cluster Reliability (The Young-Daly Model)
|
||||||
|
|
||||||
|
*Implemented in [`mlsysim.core.solver.ReliabilitySolver`](api/core.solver.ReliabilitySolver.qmd).*
|
||||||
|
|
||||||
|
::: {.callout-note appearance="simple" icon=false}
|
||||||
|
**💡 Intuition: The Cost of Checkpointing**
|
||||||
|
When training massive models on thousands of GPUs for months, hardware failures are not a possibility; they are a statistical certainty. If a node fails, the job crashes and you lose all progress since the last checkpoint. You want to save checkpoints frequently to minimize lost work, but writing a 140GB checkpoint to remote storage takes time, pausing the training. The Young-Daly model calculates the optimal balance between *time wasted saving checkpoints* and *time wasted re-computing after a failure*.
|
||||||
|
|
||||||
|
**📚 Source:** @young1974first and @daly2006higher
|
||||||
|
:::
|
||||||
|
|
||||||
|
The optimal checkpoint interval $\tau_{\text{opt}}$ is defined by the Mean Time Between Failures ($M$) and the time it takes to write a single checkpoint ($\delta$):
|
||||||
|
|
||||||
|
$$
|
||||||
|
\tau_{\text{opt}} = \sqrt{2 \times \delta \times M}
|
||||||
|
$$
|
||||||
|
|
||||||
|
For a cluster, the collective $M$ is inversely proportional to the number of components. If a single node has an MTBF of 10,000 hours, a cluster of 1,000 nodes will have an MTBF of just 10 hours ($10{,}000 / 1000$).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
::: {.callout-note}
|
::: {.callout-note}
|
||||||
## Limitations of First-Order Models
|
## Limitations of First-Order Models
|
||||||
These equations are first-order analytical models. They assume:
|
These equations are first-order analytical models. They assume:
|
||||||
|
|||||||
@@ -166,3 +166,17 @@
|
|||||||
year = {2019},
|
year = {2019},
|
||||||
doi = {10.1109/ICCAD45719.2019.8942149}
|
doi = {10.1109/ICCAD45719.2019.8942149}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@inproceedings{narayanan2021efficient,
|
||||||
|
  title     = {Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM},
|
||||||
|
  author    = {Narayanan, Deepak and Shoeybi, Mohammad and Casper, Jared and others},
|
||||||
|
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC)},
|
||||||
|
year = {2021}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{shazeer2017outrageously,
|
||||||
|
title = {Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer},
|
||||||
|
author = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and others},
|
||||||
|
journal = {arXiv preprint arXiv:1701.06538},
|
||||||
|
year = {2017}
|
||||||
|
}
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Which Solver Do I Need?"
|
title: "Which Solver Do I Need?"
|
||||||
subtitle: "A decision guide for choosing the right MLSYSIM analytical tool."
|
subtitle: "A decision guide for choosing the right MLSYSIM analytical tool."
|
||||||
---
|
---
|
||||||
|
|
||||||
MLSYSIM provides six specialized solvers, each designed to answer a different class of question about ML systems. This page helps you pick the right one.
|
MLSYSIM provides six specialized solvers, each designed to answer a different class of question about ML systems. This page helps you pick the right one.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|||||||
@@ -28,14 +28,14 @@
|
|||||||
.im-hero {
|
.im-hero {
|
||||||
background: linear-gradient(165deg, #0f172a 0%, #1e293b 100%);
|
background: linear-gradient(165deg, #0f172a 0%, #1e293b 100%);
|
||||||
color: white;
|
color: white;
|
||||||
padding: 4.5rem 2rem 3rem;
|
padding: 6.5rem 2rem 4rem;
|
||||||
position: relative;
|
position: relative;
|
||||||
overflow: hidden;
|
overflow: hidden;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Carousel + stats portion: no extra padding on top, smooth continuation */
|
/* Carousel + stats portion: no extra padding on top, smooth continuation */
|
||||||
.im-hero.im-hero-showcase {
|
.im-hero.im-hero-showcase {
|
||||||
padding: 0 2rem 3rem;
|
padding: 2rem 2rem 2rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Subtle animated grid overlay */
|
/* Subtle animated grid overlay */
|
||||||
@@ -134,7 +134,7 @@
|
|||||||
font-size: 0.9rem;
|
font-size: 0.9rem;
|
||||||
color: #94a3b8;
|
color: #94a3b8;
|
||||||
line-height: 1.7;
|
line-height: 1.7;
|
||||||
margin-bottom: 0;
|
margin-bottom: 2rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ---------- INSTALL ROW ---------- */
|
/* ---------- INSTALL ROW ---------- */
|
||||||
@@ -185,7 +185,7 @@ code.im-cmd {
|
|||||||
justify-content: center;
|
justify-content: center;
|
||||||
gap: 0.75rem;
|
gap: 0.75rem;
|
||||||
flex-wrap: wrap;
|
flex-wrap: wrap;
|
||||||
margin-bottom: 0;
|
margin-bottom: 2rem;
|
||||||
animation: fade-up 0.6s ease both;
|
animation: fade-up 0.6s ease both;
|
||||||
animation-delay: 0.8s;
|
animation-delay: 0.8s;
|
||||||
}
|
}
|
||||||
@@ -227,14 +227,45 @@ code.im-cmd {
|
|||||||
/* ---------- CAPABILITY CAROUSEL ---------- */
|
/* ---------- CAPABILITY CAROUSEL ---------- */
|
||||||
.im-carousel {
|
.im-carousel {
|
||||||
max-width: 480px;
|
max-width: 480px;
|
||||||
margin: 0 auto 2.5rem;
|
margin: 2.5rem auto 1rem;
|
||||||
animation: fade-up 0.6s ease both;
|
animation: fade-up 0.6s ease both;
|
||||||
animation-delay: 0.9s;
|
animation-delay: 0.9s;
|
||||||
|
position: relative;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.im-arrow {
|
||||||
|
position: absolute;
|
||||||
|
top: 50%;
|
||||||
|
transform: translateY(-50%);
|
||||||
|
z-index: 10;
|
||||||
|
background: rgba(255,255,255,0.06);
|
||||||
|
border: 1px solid rgba(255,255,255,0.15);
|
||||||
|
color: #94a3b8;
|
||||||
|
width: 36px;
|
||||||
|
height: 36px;
|
||||||
|
border-radius: 50%;
|
||||||
|
font-size: 1.3rem;
|
||||||
|
line-height: 1;
|
||||||
|
cursor: pointer;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
transition: all 150ms ease;
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-arrow:hover {
|
||||||
|
background: rgba(255,255,255,0.12);
|
||||||
|
border-color: rgba(255,255,255,0.3);
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-arrow-prev { left: -52px; }
|
||||||
|
.im-arrow-next { right: -52px; }
|
||||||
|
|
||||||
.im-carousel-track {
|
.im-carousel-track {
|
||||||
position: relative;
|
position: relative;
|
||||||
min-height: 195px;
|
min-height: 280px;
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-slide {
|
.im-slide {
|
||||||
@@ -242,10 +273,14 @@ code.im-cmd {
|
|||||||
top: 0;
|
top: 0;
|
||||||
left: 0;
|
left: 0;
|
||||||
right: 0;
|
right: 0;
|
||||||
|
bottom: 0;
|
||||||
opacity: 0;
|
opacity: 0;
|
||||||
transform: translateY(8px);
|
transform: translateY(8px);
|
||||||
transition: opacity 0.5s ease, transform 0.5s ease;
|
transition: opacity 0.5s ease, transform 0.5s ease;
|
||||||
pointer-events: none;
|
pointer-events: none;
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
justify-content: center;
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-slide-active {
|
.im-slide-active {
|
||||||
@@ -255,7 +290,7 @@ code.im-cmd {
|
|||||||
}
|
}
|
||||||
|
|
||||||
.im-slide-label {
|
.im-slide-label {
|
||||||
font-size: 0.7rem;
|
font-size: 0.75rem;
|
||||||
font-weight: 600;
|
font-weight: 600;
|
||||||
text-transform: uppercase;
|
text-transform: uppercase;
|
||||||
letter-spacing: 0.1em;
|
letter-spacing: 0.1em;
|
||||||
@@ -266,9 +301,10 @@ code.im-cmd {
|
|||||||
|
|
||||||
.im-slide-viz {
|
.im-slide-viz {
|
||||||
background: rgba(255,255,255,0.04);
|
background: rgba(255,255,255,0.04);
|
||||||
|
backdrop-filter: blur(8px);
|
||||||
border: 1px solid rgba(255,255,255,0.08);
|
border: 1px solid rgba(255,255,255,0.08);
|
||||||
border-radius: 10px;
|
border-radius: 10px;
|
||||||
padding: 0.75rem;
|
padding: 1.25rem;
|
||||||
margin-bottom: 0.6rem;
|
margin-bottom: 0.6rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -279,6 +315,7 @@ code.im-cmd {
|
|||||||
}
|
}
|
||||||
|
|
||||||
.im-slide-caption {
|
.im-slide-caption {
|
||||||
|
padding: 0 1rem;
|
||||||
font-size: 0.78rem;
|
font-size: 0.78rem;
|
||||||
color: #94a3b8;
|
color: #94a3b8;
|
||||||
text-align: center;
|
text-align: center;
|
||||||
@@ -289,7 +326,8 @@ code.im-cmd {
|
|||||||
display: flex;
|
display: flex;
|
||||||
justify-content: center;
|
justify-content: center;
|
||||||
gap: 0.5rem;
|
gap: 0.5rem;
|
||||||
margin-top: 1rem;
|
margin-top: 1.5rem;
|
||||||
|
margin-bottom: 0.75rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-dot {
|
.im-dot {
|
||||||
@@ -318,35 +356,41 @@ code.im-cmd {
|
|||||||
.im-stats {
|
.im-stats {
|
||||||
display: flex;
|
display: flex;
|
||||||
justify-content: center;
|
justify-content: center;
|
||||||
gap: 2.5rem;
|
gap: 2rem;
|
||||||
flex-wrap: wrap;
|
flex-wrap: wrap;
|
||||||
padding-top: 2rem;
|
margin: 2rem auto 0.5rem;
|
||||||
border-top: 1px solid rgba(255,255,255,0.08);
|
padding: 0;
|
||||||
animation: fade-up 0.6s ease both;
|
animation: fade-up 0.6s ease both;
|
||||||
animation-delay: 0.95s;
|
animation-delay: 0.6s;
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-stat {
|
.im-stat {
|
||||||
text-align: center;
|
text-align: center;
|
||||||
|
padding: 0.5rem 1rem;
|
||||||
|
background: rgba(255, 255, 255, 0.03);
|
||||||
|
border-radius: 8px;
|
||||||
|
border: 1px solid rgba(255, 255, 255, 0.05);
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-stat-num {
|
.im-stat-num {
|
||||||
|
/* Adjusted for text-based stats */
|
||||||
display: block;
|
display: block;
|
||||||
font-size: 1.75rem;
|
font-size: 1.5rem;
|
||||||
font-weight: 800;
|
font-weight: 800;
|
||||||
color: #38bdf8;
|
color: #38bdf8;
|
||||||
letter-spacing: -0.02em;
|
letter-spacing: -0.02em;
|
||||||
line-height: 1.1;
|
line-height: 1.1;
|
||||||
|
text-shadow: 0 0 20px rgba(56, 189, 248, 0.2);
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-stat-label {
|
.im-stat-label {
|
||||||
display: block;
|
display: block;
|
||||||
font-size: 0.72rem;
|
font-size: 0.65rem;
|
||||||
font-weight: 500;
|
font-weight: 600;
|
||||||
color: #64748b;
|
color: #94a3b8;
|
||||||
text-transform: uppercase;
|
text-transform: uppercase;
|
||||||
letter-spacing: 0.08em;
|
letter-spacing: 0.05em;
|
||||||
margin-top: 0.3rem;
|
margin-top: 0.2rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ---------- CONTENT CONTAINER ---------- */
|
/* ---------- CONTENT CONTAINER ---------- */
|
||||||
@@ -393,7 +437,7 @@ code.im-cmd {
|
|||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
||||||
gap: 0.875rem;
|
gap: 0.875rem;
|
||||||
margin-top: 1rem;
|
margin-top: 1.5rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-solver-card {
|
.im-solver-card {
|
||||||
@@ -440,7 +484,7 @@ code.im-cmd {
|
|||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: repeat(auto-fill, minmax(240px, 1fr));
|
grid-template-columns: repeat(auto-fill, minmax(240px, 1fr));
|
||||||
gap: 1rem;
|
gap: 1rem;
|
||||||
margin-top: 1rem;
|
margin-top: 1.5rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-tutorial-card {
|
.im-tutorial-card {
|
||||||
@@ -502,7 +546,7 @@ code.im-cmd {
|
|||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
|
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
|
||||||
gap: 1.5rem;
|
gap: 1.5rem;
|
||||||
margin-top: 1rem;
|
margin-top: 1.5rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-audience-item {
|
.im-audience-item {
|
||||||
@@ -529,6 +573,30 @@ code.im-cmd {
|
|||||||
.im-aud-instructor { border-left-color: #d97706; }
|
.im-aud-instructor { border-left-color: #d97706; }
|
||||||
.im-aud-engineer { border-left-color: #059669; }
|
.im-aud-engineer { border-left-color: #059669; }
|
||||||
|
|
||||||
|
.im-audience-item a {
|
||||||
|
text-decoration: none !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-audience-item a strong {
|
||||||
|
color: #1e293b;
|
||||||
|
transition: color 0.15s ease;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-audience-item a:hover strong {
|
||||||
|
color: #0284c7;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-audience-item p a {
|
||||||
|
font-size: 0.82rem;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #0284c7 !important;
|
||||||
|
text-decoration: none !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-audience-item p a:hover {
|
||||||
|
text-decoration: underline !important;
|
||||||
|
}
|
||||||
|
|
||||||
/* ---------- RESPONSIVE ---------- */
|
/* ---------- RESPONSIVE ---------- */
|
||||||
@media (max-width: 768px) {
|
@media (max-width: 768px) {
|
||||||
.im-hero {
|
.im-hero {
|
||||||
@@ -539,6 +607,16 @@ code.im-cmd {
|
|||||||
padding: 0 1.5rem 2rem;
|
padding: 0 1.5rem 2rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.im-arrow-prev { left: -6px; }
|
||||||
|
.im-arrow-next { right: -6px; }
|
||||||
|
.im-arrow {
|
||||||
|
width: 30px;
|
||||||
|
height: 30px;
|
||||||
|
font-size: 1.1rem;
|
||||||
|
background: rgba(15,23,42,0.8);
|
||||||
|
backdrop-filter: blur(4px);
|
||||||
|
}
|
||||||
|
|
||||||
.im-title {
|
.im-title {
|
||||||
font-size: clamp(2.25rem, 8vw, 3rem);
|
font-size: clamp(2.25rem, 8vw, 3rem);
|
||||||
}
|
}
|
||||||
@@ -552,6 +630,8 @@ code.im-cmd {
|
|||||||
}
|
}
|
||||||
|
|
||||||
.im-stat-num {
|
.im-stat-num {
|
||||||
|
/* Adjusted for text-based stats */
|
||||||
|
text-shadow: 0 0 20px rgba(56, 189, 248, 0.3);
|
||||||
font-size: 1.4rem;
|
font-size: 1.4rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -596,3 +676,67 @@ code.im-cmd {
|
|||||||
max-width: 240px;
|
max-width: 240px;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Overriding stats for top placement */
|
||||||
|
|
||||||
|
.im-stats {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(4, 1fr);
|
||||||
|
gap: 1rem;
|
||||||
|
max-width: 800px;
|
||||||
|
margin: 2.5rem auto 1.5rem;
|
||||||
|
padding: 0;
|
||||||
|
animation: fade-up 0.6s ease both;
|
||||||
|
animation-delay: 0.6s;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-stat {
|
||||||
|
text-align: center;
|
||||||
|
padding: 0.75rem 0.5rem;
|
||||||
|
background: rgba(255, 255, 255, 0.03);
|
||||||
|
border-radius: 10px;
|
||||||
|
border: 1px solid rgba(255, 255, 255, 0.06);
|
||||||
|
backdrop-filter: blur(4px);
|
||||||
|
transition: transform 0.2s ease, background 0.2s ease;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-stat:hover {
|
||||||
|
background: rgba(255, 255, 255, 0.05);
|
||||||
|
transform: translateY(-2px);
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-stat-num {
|
||||||
|
/* Adjusted for text-based stats */
|
||||||
|
display: block;
|
||||||
|
font-size: clamp(1rem, 2.5vw, 1.5rem);
|
||||||
|
font-weight: 800;
|
||||||
|
color: #38bdf8;
|
||||||
|
letter-spacing: -0.02em;
|
||||||
|
line-height: 1.1;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-stat-label {
|
||||||
|
display: block;
|
||||||
|
font-size: 0.6rem;
|
||||||
|
font-weight: 700;
|
||||||
|
color: #94a3b8;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.08em;
|
||||||
|
margin-top: 0.25rem;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Stable wrapping for smaller screens */
|
||||||
|
@media (max-width: 768px) {
|
||||||
|
.im-stats {
|
||||||
|
grid-template-columns: repeat(2, 1fr);
|
||||||
|
max-width: 400px;
|
||||||
|
gap: 0.75rem;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (max-width: 400px) {
|
||||||
|
.im-stat-num {
|
||||||
|
font-size: 1.3rem; }
|
||||||
|
.im-stat-label { font-size: 0.55rem; }
|
||||||
|
}
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Distributed Training: 3D Parallelism and Scaling Efficiency"
|
title: "Distributed Training: 3D Parallelism and Scaling Efficiency"
|
||||||
subtitle: "Discover why 1024 GPUs rarely deliver 1024× speedup — and how to minimize the gap."
|
subtitle: "Discover why 1024 GPUs rarely deliver 1024× speedup — and how to minimize the gap."
|
||||||
---
|
---
|
||||||
|
|
||||||
::: {.callout-note}
|
::: {.callout-note}
|
||||||
## Background: Why distributed training?
|
## Background: Why distributed training?
|
||||||
|
|
||||||
@@ -139,11 +138,11 @@ result_dp = solver.solve(
|
|||||||
)
|
)
|
||||||
|
|
||||||
node_perf = result_dp["node_performance"]
|
node_perf = result_dp["node_performance"]
|
||||||
print(f"Single-GPU compute time: {node_perf.latency.to('ms'):.1f} ms/step")
|
print(f"Single-GPU compute time: {node_perf.latency.to('ms'):~.1f}/step")
|
||||||
print(f"DP all-reduce overhead: {result_dp['dp_communication_latency'].to('ms'):.2f} ms")
|
print(f"DP all-reduce overhead: {result_dp['dp_communication_latency'].to('ms'):~.2f}")
|
||||||
print(f"Pipeline bubble: {result_dp['pipeline_bubble_latency'].to('ms'):.2f} ms")
|
print(f"Pipeline bubble: {result_dp['pipeline_bubble_latency'].to('ms'):~.2f}")
|
||||||
print(f"")
|
print(f"")
|
||||||
print(f"Total step latency: {result_dp['step_latency_total'].to('ms'):.1f} ms")
|
print(f"Total step latency: {result_dp['step_latency_total'].to('ms'):~.1f}")
|
||||||
print(f"Scaling efficiency: {result_dp['scaling_efficiency']:.1%}")
|
print(f"Scaling efficiency: {result_dp['scaling_efficiency']:.1%}")
|
||||||
print(f"Effective throughput: {result_dp['effective_throughput'].magnitude:.0f} samples/s")
|
print(f"Effective throughput: {result_dp['effective_throughput'].magnitude:.0f} samples/s")
|
||||||
print(f"Parallelism: DP={result_dp['parallelism']['dp']} TP={result_dp['parallelism']['tp']} PP={result_dp['parallelism']['pp']}")
|
print(f"Parallelism: DP={result_dp['parallelism']['dp']} TP={result_dp['parallelism']['tp']} PP={result_dp['parallelism']['pp']}")
|
||||||
@@ -166,12 +165,12 @@ network bandwidth.
|
|||||||
## 4. Ring All-Reduce: The Network Tax
|
## 4. Ring All-Reduce: The Network Tax
|
||||||
|
|
||||||
The `DP all-reduce overhead` comes from the **ring all-reduce algorithm**, which is the
|
The `DP all-reduce overhead` comes from the **ring all-reduce algorithm**, which is the
|
||||||
standard method for gradient synchronization. Its time depends on:
|
standard method for gradient synchronization.
|
||||||
|
|
||||||
$$t_{\text{allreduce}} = 2 \times \frac{M \times (N-1)}{N \times B_{\text{eff}}}$$
|
::: {.callout-note}
|
||||||
|
## 🧮 See the Math
|
||||||
Where $M$ is the message size (model gradient = 2× weights in fp16), $N$ is the number
|
For the full equation deriving All-Reduce overhead from model size, node count, and fabric bandwidth, see the [Mathematical Foundations: Ring All-Reduce](../math.qmd#ring-all-reduce-data-parallelism).
|
||||||
of data-parallel replicas, and $B_{\text{eff}}$ is the effective inter-node bandwidth.
|
:::
|
||||||
|
|
||||||
The following sweep shows how fabric bandwidth affects overhead:
|
The following sweep shows how fabric bandwidth affects overhead:
|
||||||
|
|
||||||
@@ -228,9 +227,10 @@ The downside: a **pipeline bubble**. The first microbatch must flow through all
|
|||||||
the last stage can start processing the second microbatch. During that startup phase, most
|
the last stage can start processing the second microbatch. During that startup phase, most
|
||||||
GPUs are idle.
|
GPUs are idle.
|
||||||
|
|
||||||
$$\text{Bubble fraction} = \frac{P - 1}{P - 1 + M}$$
|
::: {.callout-note}
|
||||||
|
## 🧮 See the Math
|
||||||
Where $P$ is the pipeline depth (number of stages) and $M$ is the number of microbatches.
|
For the full equation governing pipeline bubbles and interleaved 1F1B schedules, see the [Mathematical Foundations: Pipeline Parallelism Bubble](../math.qmd#pipeline-parallelism-bubble).
|
||||||
|
:::
|
||||||
|
|
||||||
```{python}
|
```{python}
|
||||||
print(f"{'PP stages':>10} {'Microbatches':>13} {'Bubble %':>9} {'Comm (ms)':>10} {'Efficiency':>11}")
|
print(f"{'PP stages':>10} {'Microbatches':>13} {'Bubble %':>9} {'Comm (ms)':>10} {'Efficiency':>11}")
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Hello World: Single-Node Roofline"
|
title: "Hello World: Single-Node Roofline"
|
||||||
subtitle: "Predict model performance on hardware before writing a single CUDA kernel."
|
subtitle: "Predict model performance on hardware before writing a single CUDA kernel."
|
||||||
---
|
---
|
||||||
|
|
||||||
::: {.callout-note}
|
::: {.callout-note}
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
Complete the [Getting Started](../getting-started.qmd) guide before this tutorial. It introduces the `Engine.solve` API and the MLSys Zoo.
|
Complete the [Getting Started](../getting-started.qmd) guide before this tutorial. It introduces the `Engine.solve` API and the MLSys Zoo.
|
||||||
@@ -98,7 +97,7 @@ profile = Engine.solve(
|
|||||||
)
|
)
|
||||||
|
|
||||||
print(f"Bottleneck: {profile.bottleneck}")
|
print(f"Bottleneck: {profile.bottleneck}")
|
||||||
print(f"Latency: {profile.latency.to('ms'):.3f} ms per inference")
|
print(f"Latency: {profile.latency.to('ms'):~.3f} per inference")
|
||||||
print(f"Throughput: {profile.throughput:.0f} images/sec")
|
print(f"Throughput: {profile.throughput:.0f} images/sec")
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -129,7 +128,7 @@ for batch in [1, 4, 16, 32, 64, 128, 256]:
|
|||||||
print(
|
print(
|
||||||
f"{batch:>6} {p.bottleneck:<16} "
|
f"{batch:>6} {p.bottleneck:<16} "
|
||||||
f"{p.throughput:>10.0f}/s "
|
f"{p.throughput:>10.0f}/s "
|
||||||
f"{p.latency.to('ms'):>8.2f} ms"
|
f"{p.latency.to('ms').magnitude:>8.2f} ms"
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 51 KiB After Width: | Height: | Size: 90 KiB |
@@ -1,11 +1,7 @@
|
|||||||
---
|
---
|
||||||
title: "Tutorials"
|
title: "Tutorials"
|
||||||
subtitle: "Step-by-step guides for modeling ML Systems."
|
subtitle: "Step-by-step guides for modeling ML Systems."
|
||||||
format:
|
|
||||||
html:
|
|
||||||
toc: false
|
|
||||||
---
|
---
|
||||||
|
|
||||||
These tutorials are designed to build intuition for ML systems using the `mlsysim` framework.
|
These tutorials are designed to build intuition for ML systems using the `mlsysim` framework.
|
||||||
They map directly to chapters in the *Machine Learning Systems* textbook—start at the beginning
|
They map directly to chapters in the *Machine Learning Systems* textbook—start at the beginning
|
||||||
or jump to any topic.
|
or jump to any topic.
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "LLM Serving Lab: TTFT, ITL, and the Memory Wall"
|
title: "LLM Serving Lab: TTFT, ITL, and the Memory Wall"
|
||||||
subtitle: "Model the two physical regimes of LLM inference before deploying a single server."
|
subtitle: "Model the two physical regimes of LLM inference before deploying a single server."
|
||||||
---
|
---
|
||||||
|
|
||||||
::: {.callout-note}
|
::: {.callout-note}
|
||||||
## Background: What is an LLM and why is serving different?
|
## Background: What is an LLM and why is serving different?
|
||||||
|
|
||||||
@@ -139,12 +138,12 @@ print(f"Memory util: {result['memory_utilization']:.1%}")
|
|||||||
## 3. The KV-Cache Memory Wall
|
## 3. The KV-Cache Memory Wall
|
||||||
|
|
||||||
The KV-cache stores the Key and Value matrices from every attention layer for every token
|
The KV-cache stores the Key and Value matrices from every attention layer for every token
|
||||||
in the active context. Its size grows as:
|
in the active context. This statefulness is what makes LLM decoding uniquely memory-bound.
|
||||||
|
|
||||||
$$\text{KV-Cache} = 2 \times L \times H_{kv} \times d_{head} \times S \times B \times \text{bpp}$$
|
::: {.callout-note}
|
||||||
|
## 🧮 See the Math
|
||||||
Where $L$ = layers, $H_{kv}$ = KV heads, $S$ = sequence length, $B$ = batch size,
|
To see the exact formula for how KV-Cache size scales with sequence length, batch size, and network architecture, see the [Mathematical Foundations: KV-Cache Size](../math.qmd#kv-cache-size).
|
||||||
$\text{bpp}$ = bytes per parameter.
|
:::
|
||||||
|
|
||||||
This means doubling `batch_size` doubles the KV-cache. At some point, you hit the
|
This means doubling `batch_size` doubles the KV-cache. At some point, you hit the
|
||||||
**memory wall** — the combined model + KV-cache exceeds the accelerator's HBM capacity.
|
**memory wall** — the combined model + KV-cache exceeds the accelerator's HBM capacity.
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Sustainability Lab: Modeling Carbon Footprint"
|
title: "Sustainability Lab: Modeling Carbon Footprint"
|
||||||
subtitle: "Same model, same hardware — 41x difference in carbon footprint."
|
subtitle: "Same model, same hardware — 41x difference in carbon footprint."
|
||||||
---
|
---
|
||||||
|
|
||||||
::: {.callout-note}
|
::: {.callout-note}
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
This tutorial can be completed independently, but completing the [Hello World tutorial](hello_world.qmd) first provides useful context on how hardware performance relates to energy consumption.
|
This tutorial can be completed independently, but completing the [Hello World tutorial](hello_world.qmd) first provides useful context on how hardware performance relates to energy consumption.
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ affiliation: "Harvard University"
|
|||||||
bibliography: references.bib
|
bibliography: references.bib
|
||||||
csl: https://raw.githubusercontent.com/citation-style-language/styles/master/ieee.csl
|
csl: https://raw.githubusercontent.com/citation-style-language/styles/master/ieee.csl
|
||||||
---
|
---
|
||||||
|
|
||||||
## Abstract
|
## Abstract
|
||||||
|
|
||||||
Machine learning systems education faces a practical gap: the hardware students need to reason about — H100 clusters, InfiniBand fabrics, multi-megawatt datacenters — is inaccessible for hands-on experimentation. We present **MLSYSIM**, a first-principles analytical engine designed as the companion framework to the *Machine Learning Systems* textbook [@mlsysbook2024]. MLSYSIM provides six composable solvers covering single-node performance (Roofline), distributed training (3D Parallelism), LLM serving (Pre-fill vs. Decode), Total Cost of Ownership, carbon footprint, and cluster reliability. All quantities carry physical units via `pint.Quantity` types, enforcing dimensional correctness at runtime. A vetted registry of 18 hardware devices, 15 model architectures, and 4 regional grid profiles provides a single source of truth that keeps textbook exercises grounded in real-world specifications. The platform is open source and available at [mlsysbook.ai](https://mlsysbook.ai).
|
Machine learning systems education faces a practical gap: the hardware students need to reason about — H100 clusters, InfiniBand fabrics, multi-megawatt datacenters — is inaccessible for hands-on experimentation. We present **MLSYSIM**, a first-principles analytical engine designed as the companion framework to the *Machine Learning Systems* textbook [@mlsysbook2024]. MLSYSIM provides six composable solvers covering single-node performance (Roofline), distributed training (3D Parallelism), LLM serving (Pre-fill vs. Decode), Total Cost of Ownership, carbon footprint, and cluster reliability. All quantities carry physical units via `pint.Quantity` types, enforcing dimensional correctness at runtime. A vetted registry of 18 hardware devices, 15 model architectures, and 4 regional grid profiles provides a single source of truth that keeps textbook exercises grounded in real-world specifications. The platform is open source and available at [mlsysbook.ai](https://mlsysbook.ai).
|
||||||
@@ -62,6 +61,7 @@ This paper makes three contributions:
|
|||||||
MLSYSIM organizes the ML systems domain into five composable layers, following a strategy we call **Progressive Lowering**: abstract workload demand is progressively mapped onto concrete hardware supply through intermediate representations.
|
MLSYSIM organizes the ML systems domain into five composable layers, following a strategy we call **Progressive Lowering**: abstract workload demand is progressively mapped onto concrete hardware supply through intermediate representations.
|
||||||
|
|
||||||
```{mermaid}
|
```{mermaid}
|
||||||
|
%%{init: {'theme': 'neutral'}}%%
|
||||||
%%| fig-cap: "The MLSYSIM 5-Layer Stack. Workloads (demand) are lowered onto Hardware (supply) through Infrastructure and Systems layers. Solvers bridge demand and supply to produce analytical profiles."
|
%%| fig-cap: "The MLSYSIM 5-Layer Stack. Workloads (demand) are lowered onto Hardware (supply) through Infrastructure and Systems layers. Solvers bridge demand and supply to produce analytical profiles."
|
||||||
%%| fig-width: 100%
|
%%| fig-width: 100%
|
||||||
flowchart TB
|
flowchart TB
|
||||||
@@ -166,7 +166,7 @@ LLM inference has two physically distinct phases:
|
|||||||
|
|
||||||
2. **Decode** (Memory-Bound): Each token requires reading all model weights plus the KV-cache from HBM. Latency per token scales with `(weight_bytes + kv_cache_bytes) / bandwidth`. This determines Inter-Token Latency (ITL).
|
2. **Decode** (Memory-Bound): Each token requires reading all model weights plus the KV-cache from HBM. Latency per token scales with `(weight_bytes + kv_cache_bytes) / bandwidth`. This determines Inter-Token Latency (ITL).
|
||||||
|
|
||||||
The solver also computes KV-cache memory:
|
The solver also computes KV-cache memory [@kwon2023efficient]:
|
||||||
|
|
||||||
$$\text{KV-cache} = 2 \times n_\text{layers} \times n_\text{kv\_heads} \times d_\text{head} \times \text{seq\_len} \times \text{batch} \times \text{bytes/element}$$
|
$$\text{KV-cache} = 2 \times n_\text{layers} \times n_\text{kv\_heads} \times d_\text{head} \times \text{seq\_len} \times \text{batch} \times \text{bytes/element}$$
|
||||||
|
|
||||||
@@ -182,17 +182,17 @@ For fleet-scale training, the solver decomposes the workload using three paralle
|
|||||||
|
|
||||||
The total accelerator count constrains the decomposition: `dp_size * tp_size * pp_size = total_accelerators`.
|
The total accelerator count constrains the decomposition: `dp_size * tp_size * pp_size = total_accelerators`.
|
||||||
|
|
||||||
**Communication overhead** is modeled using the ring all-reduce formula:
|
**Communication overhead** is modeled using the ring all-reduce formula [@dean2012large]:
|
||||||
|
|
||||||
$$T_{\text{ring}} = 2 \cdot \frac{N-1}{N} \cdot \frac{S}{\text{BW}} + 2(N-1) \cdot \alpha$$
|
$$T_{\text{ring}} = 2 \cdot \frac{N-1}{N} \cdot \frac{S}{\text{BW}} + 2(N-1) \cdot \alpha$$
|
||||||
|
|
||||||
where $N$ is the number of workers, $S$ is the message size (gradient tensor bytes), BW is the effective fabric bandwidth (accounting for oversubscription), and $\alpha$ is the per-message latency.
|
where $N$ is the number of workers, $S$ is the message size (gradient tensor bytes), BW is the effective fabric bandwidth (accounting for oversubscription), and $\alpha$ is the per-message latency.
|
||||||
|
|
||||||
**Pipeline bubble fraction** follows the standard model:
|
**Pipeline bubble fraction** follows the interleaved pipeline model [@narayanan2021efficient]:
|
||||||
|
|
||||||
$$\text{Bubble} = \frac{P - 1}{P - 1 + M}$$
|
$$\text{Bubble} = \frac{P - 1}{V \times M + P - 1}$$
|
||||||
|
|
||||||
where $P$ is the pipeline depth and $M$ is the number of microbatches.
|
where $P$ is the pipeline depth, $M$ is the number of microbatches, and $V$ is the number of virtual stages per GPU.
|
||||||
|
|
||||||
**Scaling efficiency** is computed as:
|
**Scaling efficiency** is computed as:
|
||||||
|
|
||||||
@@ -244,6 +244,7 @@ where $\delta$ is the time to save one checkpoint.
|
|||||||
Real-world questions often require chaining multiple solvers. For example, answering "Can I serve Llama-70B on 4x H100s, and what will it cost?" requires the ServingSolver (feasibility and latency) followed by the EconomicsSolver (per-query cost). Similarly, "What is the most sustainable way to train GPT-3?" chains the DistributedSolver (optimal parallelism) with the SustainabilitySolver (carbon by region).
|
Real-world questions often require chaining multiple solvers. For example, answering "Can I serve Llama-70B on 4x H100s, and what will it cost?" requires the ServingSolver (feasibility and latency) followed by the EconomicsSolver (per-query cost). Similarly, "What is the most sustainable way to train GPT-3?" chains the DistributedSolver (optimal parallelism) with the SustainabilitySolver (carbon by region).
|
||||||
|
|
||||||
```{mermaid}
|
```{mermaid}
|
||||||
|
%%{init: {'theme': 'neutral'}}%%
|
||||||
%%| fig-cap: "Solver composition for compound questions. Each solver's output feeds the next, enabling multi-dimensional analysis."
|
%%| fig-cap: "Solver composition for compound questions. Each solver's output feeds the next, enabling multi-dimensional analysis."
|
||||||
%%| fig-width: 100%
|
%%| fig-width: 100%
|
||||||
flowchart LR
|
flowchart LR
|
||||||
|
|||||||
88
mlsysim/docs/zoo/composition.svg
Normal file
88
mlsysim/docs/zoo/composition.svg
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 600 500" width="100%" height="100%" style="background-color: white; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif;">
|
||||||
|
<defs>
|
||||||
|
<marker id="arrowhead" markerWidth="10" markerHeight="7" refX="9" refY="3.5" orient="auto">
|
||||||
|
<polygon points="0 0, 10 3.5, 0 7" fill="#64748b" />
|
||||||
|
</marker>
|
||||||
|
<style>
|
||||||
|
.node-rect { stroke-width: 1.5; rx: 6; ry: 6; }
|
||||||
|
.node-text-title { font-size: 14px; font-weight: 600; fill: #0f172a; text-anchor: middle; }
|
||||||
|
.node-text-sub { font-size: 11px; fill: #475569; text-anchor: middle; }
|
||||||
|
.edge-line { stroke: #94a3b8; stroke-width: 1.5; fill: none; }
|
||||||
|
.edge-label { font-size: 11px; fill: #64748b; text-anchor: middle; font-weight: 500; }
|
||||||
|
.bg-rect { fill: white; }
|
||||||
|
</style>
|
||||||
|
</defs>
|
||||||
|
|
||||||
|
<!-- HardwareNode -->
|
||||||
|
<g transform="translate(240, 40)">
|
||||||
|
<rect width="120" height="50" class="node-rect" fill="#e0f2fe" stroke="#38bdf8" />
|
||||||
|
<text x="60" y="22" class="node-text-title">HardwareNode</text>
|
||||||
|
<text x="60" y="38" class="node-text-sub">Silicon / Chip</text>
|
||||||
|
</g>
|
||||||
|
|
||||||
|
<!-- Edge: HardwareNode to Node -->
|
||||||
|
<path d="M 300 90 L 300 130" class="edge-line" marker-end="url(#arrowhead)" />
|
||||||
|
<rect x="260" y="102" width="80" height="16" fill="white" />
|
||||||
|
<text x="300" y="114" class="edge-label">accelerates</text>
|
||||||
|
|
||||||
|
<!-- Node -->
|
||||||
|
<g transform="translate(240, 140)">
|
||||||
|
<rect width="120" height="50" class="node-rect" fill="#f8fafc" stroke="#cbd5e1" />
|
||||||
|
<text x="60" y="22" class="node-text-title">Node</text>
|
||||||
|
<text x="60" y="38" class="node-text-sub">Server Chassis</text>
|
||||||
|
</g>
|
||||||
|
|
||||||
|
<!-- NetworkFabric -->
|
||||||
|
<g transform="translate(80, 140)">
|
||||||
|
<rect width="120" height="50" class="node-rect" fill="#f8fafc" stroke="#cbd5e1" />
|
||||||
|
<text x="60" y="22" class="node-text-title">NetworkFabric</text>
|
||||||
|
<text x="60" y="38" class="node-text-sub">Interconnect</text>
|
||||||
|
</g>
|
||||||
|
|
||||||
|
<!-- Edge: Node to Fleet -->
|
||||||
|
<path d="M 300 190 L 300 240" class="edge-line" marker-end="url(#arrowhead)" />
|
||||||
|
<rect x="265" y="207" width="70" height="16" fill="white" />
|
||||||
|
<text x="300" y="219" class="edge-label">composes</text>
|
||||||
|
|
||||||
|
<!-- Edge: NetworkFabric to Fleet -->
|
||||||
|
<path d="M 140 190 C 140 215, 230 215, 230 245" class="edge-line" marker-end="url(#arrowhead)" />
|
||||||
|
<rect x="145" y="215" width="60" height="16" fill="white" />
|
||||||
|
<text x="175" y="227" class="edge-label">connects</text>
|
||||||
|
|
||||||
|
<!-- Fleet -->
|
||||||
|
<g transform="translate(240, 250)">
|
||||||
|
<rect width="120" height="50" class="node-rect" fill="#e0f2fe" stroke="#38bdf8" />
|
||||||
|
<text x="60" y="22" class="node-text-title">Fleet</text>
|
||||||
|
<text x="60" y="38" class="node-text-sub">Cluster</text>
|
||||||
|
</g>
|
||||||
|
|
||||||
|
<!-- GridProfile -->
|
||||||
|
<g transform="translate(80, 360)">
|
||||||
|
<rect width="120" height="50" class="node-rect" fill="#dcfce7" stroke="#4ade80" />
|
||||||
|
<text x="60" y="22" class="node-text-title">GridProfile</text>
|
||||||
|
<text x="60" y="38" class="node-text-sub">Regional Power</text>
|
||||||
|
</g>
|
||||||
|
|
||||||
|
<!-- Edge: Fleet to Datacenter -->
|
||||||
|
<path d="M 300 300 L 300 350" class="edge-line" marker-end="url(#arrowhead)" />
|
||||||
|
<rect x="260" y="317" width="80" height="16" fill="white" />
|
||||||
|
<text x="300" y="329" class="edge-label">is hosted in</text>
|
||||||
|
|
||||||
|
<!-- Edge: GridProfile to Datacenter -->
|
||||||
|
<path d="M 200 385 L 230 385" class="edge-line" marker-end="url(#arrowhead)" />
|
||||||
|
<rect x="195" y="377" width="50" height="16" fill="white" />
|
||||||
|
<text x="215" y="389" class="edge-label">powers</text>
|
||||||
|
|
||||||
|
<!-- Datacenter -->
|
||||||
|
<g transform="translate(240, 360)">
|
||||||
|
<rect width="120" height="50" class="node-rect" fill="#f8fafc" stroke="#cbd5e1" />
|
||||||
|
<text x="60" y="22" class="node-text-title">Datacenter</text>
|
||||||
|
<text x="60" y="38" class="node-text-sub">Physical Facility</text>
|
||||||
|
</g>
|
||||||
|
|
||||||
|
<!-- Dashed bounding box for Systems -->
|
||||||
|
<rect x="60" y="120" width="320" height="200" fill="none" stroke="#cbd5e1" stroke-width="2" stroke-dasharray="8,4" rx="8" />
|
||||||
|
<rect x="70" y="112" width="135" height="16" fill="white" />
|
||||||
|
<text x="135" y="124" class="node-text-sub" font-weight="bold">Layer D: Systems</text>
|
||||||
|
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 4.0 KiB |
@@ -2,7 +2,6 @@
|
|||||||
title: "The Fleet Zoo"
|
title: "The Fleet Zoo"
|
||||||
subtitle: "Vetted System Archetypes and Multi-Node Clusters"
|
subtitle: "Vetted System Archetypes and Multi-Node Clusters"
|
||||||
---
|
---
|
||||||
|
|
||||||
The Fleet Zoo defines the **Structural Context** of ML systems—from single microcontrollers to
|
The Fleet Zoo defines the **Structural Context** of ML systems—from single microcontrollers to
|
||||||
warehouse-scale supercomputers. Fleets combine hardware nodes, network fabric, and a count to
|
warehouse-scale supercomputers. Fleets combine hardware nodes, network fabric, and a count to
|
||||||
form a complete system that the `DistributedSolver` can analyze.
|
form a complete system that the `DistributedSolver` can analyze.
|
||||||
@@ -79,7 +78,7 @@ else:
|
|||||||
|
|
||||||
### Why Fleet Size Matters
|
### Why Fleet Size Matters
|
||||||
|
|
||||||
Distributed training performance is dominated by **communication overhead**. As you add more nodes, each all-reduce synchronization step must transfer gradient data across the fabric. The `DistributedSolver` models this trade-off using the ring all-reduce formula:
|
Distributed training performance is dominated by **communication overhead**. As you add more nodes, each all-reduce synchronization step must transfer gradient data across the fabric. The `DistributedSolver` models this trade-off using the ring all-reduce formula [@dean2012large]:
|
||||||
|
|
||||||
$$T_{\text{dp}} = 2(N-1) \cdot \left(\frac{M/N}{BW} + L\right)$$
|
$$T_{\text{dp}} = 2(N-1) \cdot \left(\frac{M/N}{BW} + L\right)$$
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "The Silicon Zoo"
|
title: "The Silicon Zoo"
|
||||||
subtitle: "Vetted Specifications for AI Accelerators and Edge Devices"
|
subtitle: "Vetted Specifications for AI Accelerators and Edge Devices"
|
||||||
---
|
---
|
||||||
|
|
||||||
The Silicon Zoo is the **Single Source of Truth (SSoT)** for all physical hardware in `mlsysim`.
|
The Silicon Zoo is the **Single Source of Truth (SSoT)** for all physical hardware in `mlsysim`.
|
||||||
Every specification is typed (`pint.Quantity`), provenance-tracked, and validated against official
|
Every specification is typed (`pint.Quantity`), provenance-tracked, and validated against official
|
||||||
datasheets and MLPerf baselines—so you never have to argue about what the A100's bandwidth actually is.
|
datasheets and MLPerf baselines—so you never have to argue about what the A100's bandwidth actually is.
|
||||||
@@ -49,10 +48,10 @@ def print_hardware_table(title, hardware_class):
|
|||||||
print("| Device | Year | Peak Performance | Memory BW | Capacity | TDP |")
|
print("| Device | Year | Peak Performance | Memory BW | Capacity | TDP |")
|
||||||
print("|:---|:---:|:---:|:---:|:---:|:---:|")
|
print("|:---|:---:|:---:|:---:|:---:|:---:|")
|
||||||
|
|
||||||
for attr_name in sorted(dir(hardware_class)):
|
# Use the new Registry .list() method for coherent sorting
|
||||||
if attr_name.startswith("_"): continue
|
items = hardware_class.list(sort_by='release_year', reverse=True)
|
||||||
item = getattr(hardware_class, attr_name)
|
for item in items:
|
||||||
if "HardwareNode" in type(item).__name__:
|
if True: # Registry already filtered for us
|
||||||
flops = auto_scale(item.compute.peak_flops)
|
flops = auto_scale(item.compute.peak_flops)
|
||||||
bw = auto_scale(item.memory.bandwidth)
|
bw = auto_scale(item.memory.bandwidth)
|
||||||
cap = auto_scale(item.memory.capacity)
|
cap = auto_scale(item.memory.capacity)
|
||||||
@@ -101,4 +100,12 @@ These specifications are used throughout Volumes 1 and 2 of the textbook. The *H
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
||||||
|
::: {.callout-note}
|
||||||
|
## Missing a device?
|
||||||
|
You can define custom hardware specs on-the-fly in Python or contribute new vetted specs to the registry.
|
||||||
|
See the [Contributing Guide](../contributing.qmd) for how to add persistent specs, or the
|
||||||
|
[Hardware API Reference](../api/hardware.qmd) for defining custom objects.
|
||||||
|
:::
|
||||||
|
|
||||||
*Note: For full technical specs and validation details, see the API Reference.*
|
*Note: For full technical specs and validation details, see the API Reference.*
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "The MLSys Zoo"
|
title: "The MLSys Zoo"
|
||||||
subtitle: "A Single Source of Truth for ML Systems Specifications"
|
subtitle: "A Single Source of Truth for ML Systems Specifications"
|
||||||
---
|
---
|
||||||
|
|
||||||
The MLSys Zoo is a centralized, vetted registry of specifications used throughout
|
The MLSys Zoo is a centralized, vetted registry of specifications used throughout
|
||||||
the `mlsysim` platform. Every entry is strictly typed with `pint.Quantity` for
|
the `mlsysim` platform. Every entry is strictly typed with `pint.Quantity` for
|
||||||
dimensional correctness, provenance-tracked, and validated against official sources.
|
dimensional correctness, provenance-tracked, and validated against official sources.
|
||||||
@@ -29,23 +28,19 @@ to every solver and tutorial.
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Understanding the 5-Layer Stack
|
## System Composition Hierarchy
|
||||||
|
|
||||||
The Zoo catalogs map onto the five analytical layers of MLSYSIM:
|
ML systems are structurally composed of smaller parts. The `mlsysim` registry reflects this physical reality. Before a workload can be evaluated, the structural components are combined into a coherent system.
|
||||||
|
|
||||||
```
|
Here is how the components in the Zoo relate to each other:
|
||||||
[Workloads] ← Model Zoo (what the algorithm demands)
|
|
||||||
↓
|
|
||||||
[Hardware] ← Silicon Zoo (what the chip supplies)
|
|
||||||
↓
|
|
||||||
[Infrastructure] ← Infrastructure Zoo (the environment it runs in)
|
|
||||||
↓
|
|
||||||
[Systems] ← Fleet Zoo (the structural arrangement)
|
|
||||||
↓
|
|
||||||
[Solvers] ← Engine (lowers demand onto supply, produces profile)
|
|
||||||
```
|
|
||||||
|
|
||||||
Each Zoo catalog is the authoritative input to one layer of the progressive lowering stack.
|
{fig-align="center" width="100%"}
|
||||||
|
|
||||||
|
1. **HardwareNode (Silicon):** The fundamental unit of compute (e.g., an H100 GPU or a DGX Spark GB10 superchip). It provides FLOPs and Memory Bandwidth.
|
||||||
|
2. **Node:** A single server chassis. It contains one or more `HardwareNode`s connected by a high-speed intra-node bus (like NVLink).
|
||||||
|
3. **NetworkFabric:** The inter-node networking (e.g., InfiniBand NDR or 100GbE) that allows servers to communicate.
|
||||||
|
4. **Fleet (Cluster):** A collection of `Node`s connected by a `NetworkFabric`. This is the top-level entity used for distributed training and cluster reliability models.
|
||||||
|
5. **Datacenter & GridProfile (Infra/Regions):** The physical facility and regional power grid that hosts the `Fleet`. It dictates the Power Usage Effectiveness (PUE) and the carbon intensity of the electricity consumed.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "The Infrastructure Zoo"
|
title: "The Infrastructure Zoo"
|
||||||
subtitle: "Regional Grids and Sustainability Baselines"
|
subtitle: "Regional Grids and Sustainability Baselines"
|
||||||
---
|
---
|
||||||
|
|
||||||
The Infrastructure Zoo provides the **Environmental Context** for ML deployments—the carbon intensity
|
The Infrastructure Zoo provides the **Environmental Context** for ML deployments—the carbon intensity
|
||||||
of regional electricity grids and datacenter efficiency profiles. Every value is sourced from
|
of regional electricity grids and datacenter efficiency profiles. Every value is sourced from
|
||||||
published government energy data and IEA reporting.
|
published government energy data and IEA reporting.
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "The Model Zoo"
|
title: "The Model Zoo"
|
||||||
subtitle: "Reference Workloads for Systems Modeling"
|
subtitle: "Reference Workloads for Systems Modeling"
|
||||||
---
|
---
|
||||||
|
|
||||||
The Model Zoo defines the **Computational Demand** placed on the hardware. Every workload is
|
The Model Zoo defines the **Computational Demand** placed on the hardware. Every workload is
|
||||||
pulled from the `mlsysim.Models` registry and characterized by its FLOPs, parameter count, and
|
pulled from the `mlsysim.Models` registry and characterized by its FLOPs, parameter count, and
|
||||||
architecture type—independent of any specific hardware.
|
architecture type—independent of any specific hardware.
|
||||||
@@ -97,4 +96,12 @@ The *Model Training* and *Model Serving* chapters use these workload profiles to
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
*Note: For dynamic memory footprint and KV-cache calculations, see the API Reference.*
|
*
|
||||||
|
::: {.callout-note}
|
||||||
|
## Add your own model
|
||||||
|
Defining custom workloads is straightforward. You can extend the registry or define a
|
||||||
|
(or ) object directly in your code.
|
||||||
|
Learn more in the [Contributing Guide](../contributing.qmd) and the [Models API Reference](../api/models.qmd).
|
||||||
|
:::
|
||||||
|
|
||||||
|
Note: For dynamic memory footprint and KV-cache calculations, see the API Reference.*
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
from .types import HardwareNode, ComputeCore, MemoryHierarchy
|
from .types import HardwareNode, ComputeCore, MemoryHierarchy
|
||||||
|
from ..core.registry import Registry
|
||||||
from ..core.constants import (
|
from ..core.constants import (
|
||||||
ureg,
|
ureg,
|
||||||
V100_MEM_BW, V100_FLOPS_FP16_TENSOR, V100_MEM_CAPACITY, V100_TDP, V100_FLOPS_FP32,
|
V100_MEM_BW, V100_FLOPS_FP16_TENSOR, V100_MEM_CAPACITY, V100_TDP, V100_FLOPS_FP32,
|
||||||
@@ -10,7 +11,7 @@ from ..core.constants import (
|
|||||||
T4_MEM_BW, T4_FLOPS_FP16_TENSOR, T4_TDP, T4_FLOPS_INT8
|
T4_MEM_BW, T4_FLOPS_FP16_TENSOR, T4_TDP, T4_FLOPS_INT8
|
||||||
)
|
)
|
||||||
|
|
||||||
class CloudHardware:
|
class CloudHardware(Registry):
|
||||||
"""Datacenter-scale accelerators (Volume II)."""
|
"""Datacenter-scale accelerators (Volume II)."""
|
||||||
V100 = HardwareNode(
|
V100 = HardwareNode(
|
||||||
name="NVIDIA V100",
|
name="NVIDIA V100",
|
||||||
@@ -86,8 +87,20 @@ class CloudHardware:
|
|||||||
dispatch_tax=0.03 * ureg.ms
|
dispatch_tax=0.03 * ureg.ms
|
||||||
)
|
)
|
||||||
|
|
||||||
class WorkstationHardware:
|
class WorkstationHardware(Registry):
|
||||||
"""Personal computing systems used for local development."""
|
"""Personal computing systems used for local development."""
|
||||||
|
DGX_Spark = HardwareNode(
|
||||||
|
name="NVIDIA DGX Spark (GB10)",
|
||||||
|
release_year=2024,
|
||||||
|
compute=ComputeCore(
|
||||||
|
peak_flops=250 * ureg.TFLOPs/ureg.s,
|
||||||
|
precision_flops={"fp8": 500 * ureg.TFLOPs/ureg.s, "fp4": 1000 * ureg.TFLOPs/ureg.s}
|
||||||
|
),
|
||||||
|
memory=MemoryHierarchy(capacity=128 * ureg.GB, bandwidth=500 * ureg.GB/ureg.s),
|
||||||
|
tdp=250 * ureg.W,
|
||||||
|
dispatch_tax=0.01 * ureg.ms
|
||||||
|
)
|
||||||
|
|
||||||
MacBookM3Max = HardwareNode(
|
MacBookM3Max = HardwareNode(
|
||||||
name="MacBook Pro (M3 Max)",
|
name="MacBook Pro (M3 Max)",
|
||||||
release_year=2023,
|
release_year=2023,
|
||||||
@@ -97,7 +110,7 @@ class WorkstationHardware:
|
|||||||
dispatch_tax=0.05 * ureg.ms
|
dispatch_tax=0.05 * ureg.ms
|
||||||
)
|
)
|
||||||
|
|
||||||
class MobileHardware:
|
class MobileHardware(Registry):
|
||||||
"""Smartphone and handheld devices (Volume I)."""
|
"""Smartphone and handheld devices (Volume I)."""
|
||||||
iPhone15Pro = HardwareNode(
|
iPhone15Pro = HardwareNode(
|
||||||
name="iPhone 15 Pro (A17 Pro)",
|
name="iPhone 15 Pro (A17 Pro)",
|
||||||
@@ -127,7 +140,7 @@ class MobileHardware:
|
|||||||
dispatch_tax=1.5 * ureg.ms
|
dispatch_tax=1.5 * ureg.ms
|
||||||
)
|
)
|
||||||
|
|
||||||
class EdgeHardware:
|
class EdgeHardware(Registry):
|
||||||
"""Robotics and Industrial Edge (Volume I)."""
|
"""Robotics and Industrial Edge (Volume I)."""
|
||||||
JetsonOrinNX = HardwareNode(
|
JetsonOrinNX = HardwareNode(
|
||||||
name="NVIDIA Jetson Orin NX",
|
name="NVIDIA Jetson Orin NX",
|
||||||
@@ -165,7 +178,7 @@ class EdgeHardware:
|
|||||||
dispatch_tax=0.1 * ureg.ms
|
dispatch_tax=0.1 * ureg.ms
|
||||||
)
|
)
|
||||||
|
|
||||||
class TinyHardware:
|
class TinyHardware(Registry):
|
||||||
"""Microcontrollers and sub-watt devices."""
|
"""Microcontrollers and sub-watt devices."""
|
||||||
ESP32_S3 = HardwareNode(
|
ESP32_S3 = HardwareNode(
|
||||||
name="ESP32-S3 (AI)",
|
name="ESP32-S3 (AI)",
|
||||||
@@ -186,7 +199,7 @@ class TinyHardware:
|
|||||||
dispatch_tax=2.0 * ureg.ms
|
dispatch_tax=2.0 * ureg.ms
|
||||||
)
|
)
|
||||||
|
|
||||||
class Hardware:
|
class Hardware(Registry):
|
||||||
Cloud = CloudHardware
|
Cloud = CloudHardware
|
||||||
Workstation = WorkstationHardware
|
Workstation = WorkstationHardware
|
||||||
Mobile = MobileHardware
|
Mobile = MobileHardware
|
||||||
@@ -203,6 +216,9 @@ class Hardware:
|
|||||||
TPUv5p = CloudHardware.TPUv5p
|
TPUv5p = CloudHardware.TPUv5p
|
||||||
T4 = CloudHardware.T4
|
T4 = CloudHardware.T4
|
||||||
|
|
||||||
|
DGXSpark = WorkstationHardware.DGX_Spark
|
||||||
|
MacBook = WorkstationHardware.MacBookM3Max
|
||||||
|
|
||||||
iPhone = MobileHardware.iPhone15Pro
|
iPhone = MobileHardware.iPhone15Pro
|
||||||
Snapdragon = MobileHardware.Snapdragon8Gen3
|
Snapdragon = MobileHardware.Snapdragon8Gen3
|
||||||
Jetson = EdgeHardware.JetsonOrinNX
|
Jetson = EdgeHardware.JetsonOrinNX
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
from .types import TransformerWorkload, CNNWorkload, Workload
|
from .types import TransformerWorkload, CNNWorkload, Workload
|
||||||
|
from ..core.registry import Registry
|
||||||
|
from .types import TransformerWorkload, CNNWorkload, Workload
|
||||||
from ..core.constants import (
|
from ..core.constants import (
|
||||||
ureg,
|
ureg,
|
||||||
GPT2_PARAMS, GPT3_PARAMS, GPT4_EST_PARAMS, GPT3_TRAINING_OPS,
|
GPT2_PARAMS, GPT3_PARAMS, GPT4_EST_PARAMS, GPT3_TRAINING_OPS,
|
||||||
@@ -9,7 +11,7 @@ from ..core.constants import (
|
|||||||
ALEXNET_PARAMS, ANOMALY_MODEL_PARAMS, DLRM_MODEL_SIZE_FP32
|
ALEXNET_PARAMS, ANOMALY_MODEL_PARAMS, DLRM_MODEL_SIZE_FP32
|
||||||
)
|
)
|
||||||
|
|
||||||
class LanguageModels:
|
class LanguageModels(Registry):
|
||||||
GPT2 = TransformerWorkload(
|
GPT2 = TransformerWorkload(
|
||||||
name="GPT-2 (1.5B)",
|
name="GPT-2 (1.5B)",
|
||||||
architecture="Transformer",
|
architecture="Transformer",
|
||||||
@@ -77,7 +79,7 @@ class LanguageModels:
|
|||||||
inference_flops=2 * LLAMA3_70B_PARAMS.magnitude * ureg.flop
|
inference_flops=2 * LLAMA3_70B_PARAMS.magnitude * ureg.flop
|
||||||
)
|
)
|
||||||
|
|
||||||
class VisionModels:
|
class VisionModels(Registry):
|
||||||
ResNet50 = CNNWorkload(
|
ResNet50 = CNNWorkload(
|
||||||
name="ResNet-50",
|
name="ResNet-50",
|
||||||
architecture="CNN",
|
architecture="CNN",
|
||||||
@@ -107,7 +109,7 @@ class VisionModels:
|
|||||||
layers=8
|
layers=8
|
||||||
)
|
)
|
||||||
|
|
||||||
class TinyModels:
|
class TinyModels(Registry):
|
||||||
DS_CNN = CNNWorkload(
|
DS_CNN = CNNWorkload(
|
||||||
name="DS-CNN (KWS)",
|
name="DS-CNN (KWS)",
|
||||||
architecture="CNN",
|
architecture="CNN",
|
||||||
@@ -126,7 +128,7 @@ class TinyModels:
|
|||||||
# Generic Workload doesn't have params in type, but we can override
|
# Generic Workload doesn't have params in type, but we can override
|
||||||
)
|
)
|
||||||
|
|
||||||
class RecommendationModels:
|
class RecommendationModels(Registry):
|
||||||
# Special class for DLRM as it's defined by size
|
# Special class for DLRM as it's defined by size
|
||||||
DLRM = Workload(
|
DLRM = Workload(
|
||||||
name="DLRM",
|
name="DLRM",
|
||||||
@@ -136,7 +138,7 @@ class RecommendationModels:
|
|||||||
# Note: We'll add specialized size methods if needed,
|
# Note: We'll add specialized size methods if needed,
|
||||||
# but for now we maintain string compatibility.
|
# but for now we maintain string compatibility.
|
||||||
|
|
||||||
class Models:
|
class Models(Registry):
|
||||||
Language = LanguageModels
|
Language = LanguageModels
|
||||||
Vision = VisionModels
|
Vision = VisionModels
|
||||||
Tiny = TinyModels
|
Tiny = TinyModels
|
||||||
|
|||||||
@@ -127,6 +127,25 @@ class TransformerWorkload(Workload):
|
|||||||
layers=self.layers
|
layers=self.layers
|
||||||
)
|
)
|
||||||
|
|
||||||
|
class SparseTransformerWorkload(TransformerWorkload):
|
||||||
|
active_parameters: Quantity
|
||||||
|
experts: int
|
||||||
|
active_experts_per_token: int = 1
|
||||||
|
|
||||||
|
def lower(self, precision: Quantity = BYTES_FP16) -> ComputationGraph:
|
||||||
|
# For MoE, total parameters define the memory footprint,
|
||||||
|
# but active parameters define the computation flops.
|
||||||
|
ops = self.inference_flops or (2 * self.active_parameters.to(ureg.count).magnitude * ureg.flop)
|
||||||
|
weights = self.size_in_bytes(precision) # uses self.parameters (total params)
|
||||||
|
return ComputationGraph(
|
||||||
|
name=self.name,
|
||||||
|
total_ops=ops,
|
||||||
|
parameter_count=self.parameters,
|
||||||
|
weight_bytes=weights,
|
||||||
|
arithmetic_intensity=(ops / weights).to("flop/byte"),
|
||||||
|
layers=self.layers
|
||||||
|
)
|
||||||
|
|
||||||
class CNNWorkload(Workload):
|
class CNNWorkload(Workload):
|
||||||
parameters: Quantity
|
parameters: Quantity
|
||||||
inference_flops: Quantity
|
inference_flops: Quantity
|
||||||
|
|||||||
@@ -56,38 +56,178 @@ def setup_plot(figsize=(8, 5)):
|
|||||||
|
|
||||||
def plot_roofline(hardware_node, workloads=None):
|
def plot_roofline(hardware_node, workloads=None):
|
||||||
"""
|
"""
|
||||||
Plots a standard Roofline Model for a given HardwareNode.
|
Plots a publication-quality Roofline Model for a given HardwareNode.
|
||||||
Follows the LEGO-style visualization pattern.
|
|
||||||
|
Features:
|
||||||
|
- Ridge point annotated with numeric value
|
||||||
|
- Memory-bound and compute-bound regions shaded and labeled
|
||||||
|
- Memory bandwidth ceiling (diagonal) and compute ceiling (flat)
|
||||||
|
- Workloads plotted with bottleneck classification
|
||||||
"""
|
"""
|
||||||
# 1. PARAMETERS
|
# 1. PARAMETERS
|
||||||
peak_flops = hardware_node.compute.peak_flops.to('TFLOPs/s').magnitude
|
peak_flops = hardware_node.compute.peak_flops.to("TFLOPs/s").magnitude
|
||||||
peak_bw = hardware_node.memory.bandwidth.to('GB/s').magnitude
|
peak_bw = hardware_node.memory.bandwidth.to("GB/s").magnitude
|
||||||
|
ridge_point = peak_flops / (peak_bw / 1000) # FLOP/Byte
|
||||||
# 2. INVARIANTS
|
|
||||||
x_intensities = np.logspace(-1, 4, 100)
|
# 2. AXIS RANGE
|
||||||
|
x_min, x_max = 0.1, 10000
|
||||||
# 3. CALCULATION
|
x = np.logspace(np.log10(x_min), np.log10(x_max), 500)
|
||||||
y_memory_bound = peak_bw * x_intensities / 1000 # TFLOPs equivalent
|
|
||||||
y_compute_bound = np.full_like(x_intensities, peak_flops)
|
# 3. ROOFLINE CURVES
|
||||||
y_roofline = np.minimum(y_memory_bound, y_compute_bound)
|
y_mem = peak_bw * x / 1000 # BW * AI, converted to TFLOP/s
|
||||||
|
y_compute = np.full_like(x, peak_flops)
|
||||||
# 4. OUTPUT (Visualization)
|
y_roof = np.minimum(y_mem, y_compute)
|
||||||
fig, ax, colors, plt = setup_plot()
|
|
||||||
ax.loglog(x_intensities, y_roofline, color=colors['BlueLine'], linewidth=2.5, label=f'{hardware_node.name} Roofline')
|
# 4. PLOT
|
||||||
ax.fill_between(x_intensities, 0, y_roofline, color=colors['BlueFill'], alpha=0.3)
|
fig, ax, colors, _ = setup_plot(figsize=(9, 5.5))
|
||||||
|
|
||||||
|
# Shaded regions
|
||||||
|
mem_mask = x <= ridge_point
|
||||||
|
comp_mask = x >= ridge_point
|
||||||
|
ax.fill_between(
|
||||||
|
x[mem_mask],
|
||||||
|
y_roof[mem_mask] * 0.001,
|
||||||
|
y_roof[mem_mask],
|
||||||
|
color=colors["OrangeL"],
|
||||||
|
alpha=0.5,
|
||||||
|
label="Memory-bound region",
|
||||||
|
)
|
||||||
|
ax.fill_between(
|
||||||
|
x[comp_mask],
|
||||||
|
y_roof[comp_mask] * 0.001,
|
||||||
|
y_roof[comp_mask],
|
||||||
|
color=colors["BlueFill"],
|
||||||
|
alpha=0.5,
|
||||||
|
label="Compute-bound region",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Roofline line
|
||||||
|
ax.loglog(
|
||||||
|
x,
|
||||||
|
y_roof,
|
||||||
|
color=colors["BlueLine"],
|
||||||
|
linewidth=2.5,
|
||||||
|
zorder=5,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Memory bandwidth ceiling label (on the slope)
|
||||||
|
slope_x = ridge_point * 0.08
|
||||||
|
slope_y = peak_bw * slope_x / 1000
|
||||||
|
ax.text(
|
||||||
|
slope_x,
|
||||||
|
slope_y * 1.6,
|
||||||
|
f"BW ceiling: {peak_bw:.0f} GB/s",
|
||||||
|
color=colors["OrangeLine"],
|
||||||
|
fontsize=8.5,
|
||||||
|
fontweight="bold",
|
||||||
|
rotation=38,
|
||||||
|
ha="center",
|
||||||
|
va="bottom",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Compute ceiling label (on the flat)
|
||||||
|
ax.text(
|
||||||
|
ridge_point * 8,
|
||||||
|
peak_flops * 1.12,
|
||||||
|
f"Compute ceiling: {peak_flops:.0f} TFLOP/s",
|
||||||
|
color=colors["BlueLine"],
|
||||||
|
fontsize=8.5,
|
||||||
|
fontweight="bold",
|
||||||
|
ha="center",
|
||||||
|
va="bottom",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Ridge point
|
||||||
|
ax.plot(
|
||||||
|
ridge_point,
|
||||||
|
peak_flops,
|
||||||
|
"D",
|
||||||
|
color=colors["crimson"],
|
||||||
|
markersize=9,
|
||||||
|
zorder=10,
|
||||||
|
)
|
||||||
|
ax.annotate(
|
||||||
|
f"Ridge Point\n{ridge_point:.1f} FLOP/Byte",
|
||||||
|
xy=(ridge_point, peak_flops),
|
||||||
|
xytext=(ridge_point * 3, peak_flops * 0.35),
|
||||||
|
fontsize=8.5,
|
||||||
|
fontweight="bold",
|
||||||
|
color=colors["crimson"],
|
||||||
|
ha="center",
|
||||||
|
arrowprops=dict(
|
||||||
|
arrowstyle="->",
|
||||||
|
color=colors["crimson"],
|
||||||
|
lw=1.2,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Vertical dashed line at ridge point
|
||||||
|
ax.axvline(
|
||||||
|
ridge_point,
|
||||||
|
color=colors["crimson"],
|
||||||
|
linestyle=":",
|
||||||
|
linewidth=0.8,
|
||||||
|
alpha=0.5,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Region labels
|
||||||
|
ax.text(
|
||||||
|
x_min * 1.5,
|
||||||
|
peak_flops * 0.6,
|
||||||
|
"MEMORY\nBOUND",
|
||||||
|
color=colors["OrangeLine"],
|
||||||
|
fontsize=11,
|
||||||
|
fontweight="bold",
|
||||||
|
alpha=0.25,
|
||||||
|
ha="left",
|
||||||
|
va="center",
|
||||||
|
)
|
||||||
|
ax.text(
|
||||||
|
x_max * 0.4,
|
||||||
|
peak_flops * 0.6,
|
||||||
|
"COMPUTE\nBOUND",
|
||||||
|
color=colors["BlueLine"],
|
||||||
|
fontsize=11,
|
||||||
|
fontweight="bold",
|
||||||
|
alpha=0.25,
|
||||||
|
ha="right",
|
||||||
|
va="center",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Plot workloads
|
||||||
if workloads:
|
if workloads:
|
||||||
from ..core.engine import Engine
|
from ..core.engine import Engine
|
||||||
for model in workloads:
|
|
||||||
profile = Engine.solve(model, hardware_node, efficiency=1.0)
|
|
||||||
intensity = profile.arithmetic_intensity.magnitude
|
|
||||||
theoretical_perf = min(peak_bw * intensity / 1000, peak_flops)
|
|
||||||
ax.plot(intensity, theoretical_perf, 'o', color=colors['crimson'], markersize=8)
|
|
||||||
ax.text(intensity * 1.2, theoretical_perf, model.name, color=colors['crimson'], fontsize=9, fontweight='bold')
|
|
||||||
|
|
||||||
ax.set_xlabel('Arithmetic Intensity (FLOP/Byte)')
|
workload_colors = [
|
||||||
ax.set_ylabel('Performance (TFLOPs/s)')
|
colors["crimson"],
|
||||||
ax.set_title(f'Roofline: {hardware_node.name}')
|
colors["GreenLine"],
|
||||||
|
colors["VioletLine"],
|
||||||
|
colors["BrownLine"],
|
||||||
|
]
|
||||||
|
for i, model in enumerate(workloads):
|
||||||
|
profile = Engine.solve(model, hardware_node, efficiency=1.0)
|
||||||
|
ai = profile.arithmetic_intensity.magnitude
|
||||||
|
perf = min(peak_bw * ai / 1000, peak_flops)
|
||||||
|
c = workload_colors[i % len(workload_colors)]
|
||||||
|
bound = "memory" if ai < ridge_point else "compute"
|
||||||
|
ax.plot(ai, perf, "o", color=c, markersize=9, zorder=10)
|
||||||
|
ax.annotate(
|
||||||
|
f"{model.name}\n({bound}-bound)",
|
||||||
|
xy=(ai, perf),
|
||||||
|
xytext=(ai * 0.3, perf * 0.4),
|
||||||
|
fontsize=8,
|
||||||
|
fontweight="bold",
|
||||||
|
color=c,
|
||||||
|
ha="center",
|
||||||
|
arrowprops=dict(arrowstyle="->", color=c, lw=1),
|
||||||
|
)
|
||||||
|
|
||||||
|
ax.set_xlabel("Arithmetic Intensity (FLOP/Byte)")
|
||||||
|
ax.set_ylabel("Performance (TFLOP/s)")
|
||||||
|
ax.set_title(f"Roofline: {hardware_node.name}")
|
||||||
|
ax.set_xlim(x_min, x_max)
|
||||||
|
ax.set_ylim(peak_flops * 0.001, peak_flops * 2)
|
||||||
|
ax.legend(loc="lower right", fontsize=8, framealpha=0.9)
|
||||||
return fig, ax
|
return fig, ax
|
||||||
|
|
||||||
def plot_evaluation_scorecard(evaluation):
|
def plot_evaluation_scorecard(evaluation):
|
||||||
|
|||||||
Reference in New Issue
Block a user