mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-04-30 17:48:27 -05:00
docs: clean up landing page and centralize math foundations
- Elevate 5-Layer Progressive Lowering mental model to architecture.qmd - Clean up landing page copy to be a punchy one-liner - Re-render architecture composition diagram as SVG for reliability - Move math derivations out of tutorials and into math.qmd with citations - Add DGX Spark to Silicon Zoo
This commit is contained in:
@@ -204,6 +204,30 @@ def calc_tree_allreduce_time(message_bytes, n_gpus, bandwidth_bytes_s, latency_s
|
|||||||
return (bw_term + lat_term).to(ureg.second)
|
return (bw_term + lat_term).to(ureg.second)
|
||||||
|
|
||||||
|
|
||||||
|
def calc_all_to_all_time(message_bytes, n_gpus, bandwidth_bytes_s, latency_s):
|
||||||
|
"""
|
||||||
|
All-to-All communication time estimate (typical for MoE token routing).
|
||||||
|
|
||||||
|
T = (N-1)/N × M/β + (N-1) × α
|
||||||
|
|
||||||
|
Args:
|
||||||
|
message_bytes: Total message size in bytes (M) per node
|
||||||
|
n_gpus: Number of GPUs (N)
|
||||||
|
bandwidth_bytes_s: Per-link bandwidth in bytes/second (β)
|
||||||
|
latency_s: Per-message startup latency in seconds (α)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Quantity[second]: Estimated All-to-All time
|
||||||
|
"""
|
||||||
|
msg = _ensure_unit(message_bytes, ureg.byte)
|
||||||
|
bw = _ensure_unit(bandwidth_bytes_s, ureg.byte / ureg.second)
|
||||||
|
lat = _ensure_unit(latency_s, ureg.second)
|
||||||
|
n = n_gpus
|
||||||
|
bw_term = (n - 1) / n * msg / bw
|
||||||
|
lat_term = (n - 1) * lat
|
||||||
|
return (bw_term + lat_term).to(ureg.second)
|
||||||
|
|
||||||
|
|
||||||
def calc_transformer_training_flops(n_params, n_tokens):
|
def calc_transformer_training_flops(n_params, n_tokens):
|
||||||
"""
|
"""
|
||||||
Estimate total training FLOPs for a Transformer model (6PD rule).
|
Estimate total training FLOPs for a Transformer model (6PD rule).
|
||||||
@@ -359,20 +383,21 @@ def calc_mtbf_node(gpu_mtbf_h, n_gpus, nic_mtbf_h, n_nics,
|
|||||||
return (1.0 / rate).to(ureg.hour)
|
return (1.0 / rate).to(ureg.hour)
|
||||||
|
|
||||||
|
|
||||||
def calc_pipeline_bubble(n_stages, n_microbatches):
|
def calc_pipeline_bubble(n_stages, n_microbatches, v_stages=1):
|
||||||
"""
|
"""
|
||||||
Pipeline bubble fraction (GPipe / 1F1B).
|
Pipeline bubble fraction (GPipe / 1F1B / Interleaved 1F1B).
|
||||||
|
|
||||||
bubble = (P - 1) / (P - 1 + M)
|
bubble = (P - 1) / (V * M + P - 1)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
n_stages: Number of pipeline stages (P)
|
n_stages: Number of pipeline stages (P)
|
||||||
n_microbatches: Number of microbatches (M)
|
n_microbatches: Number of microbatches (M)
|
||||||
|
v_stages: Number of virtual stages per GPU (V, default 1)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Bubble fraction (0.0 to 1.0)
|
Bubble fraction (0.0 to 1.0)
|
||||||
"""
|
"""
|
||||||
return (n_stages - 1) / (n_stages - 1 + n_microbatches)
|
return (n_stages - 1) / (v_stages * n_microbatches + n_stages - 1)
|
||||||
|
|
||||||
|
|
||||||
def calc_checkpoint_size(n_params, bytes_per_param=16):
|
def calc_checkpoint_size(n_params, bytes_per_param=16):
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from .formulas import (
|
|||||||
calc_ring_allreduce_time,
|
calc_ring_allreduce_time,
|
||||||
calc_tree_allreduce_time,
|
calc_tree_allreduce_time,
|
||||||
calc_hierarchical_allreduce_time,
|
calc_hierarchical_allreduce_time,
|
||||||
|
calc_all_to_all_time,
|
||||||
calc_mtbf_cluster,
|
calc_mtbf_cluster,
|
||||||
calc_young_daly_interval,
|
calc_young_daly_interval,
|
||||||
calc_failure_probability,
|
calc_failure_probability,
|
||||||
@@ -64,10 +65,12 @@ class DistributedSolver(BaseSolver):
|
|||||||
efficiency: float = 0.5,
|
efficiency: float = 0.5,
|
||||||
tp_size: int = 1,
|
tp_size: int = 1,
|
||||||
pp_size: int = 1,
|
pp_size: int = 1,
|
||||||
|
ep_size: int = 1,
|
||||||
|
v_stages: int = 1,
|
||||||
microbatch_count: int = 1,
|
microbatch_count: int = 1,
|
||||||
topology_override: Optional[str] = None) -> Dict[str, Any]:
|
topology_override: Optional[str] = None) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Calculates distributed training performance using the 3D Parallelism model.
|
Calculates distributed training performance using the 3D/4D Parallelism model.
|
||||||
|
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
@@ -87,6 +90,11 @@ class DistributedSolver(BaseSolver):
|
|||||||
pp_size : int
|
pp_size : int
|
||||||
Pipeline Parallelism degree. Chains model layers across multiple
|
Pipeline Parallelism degree. Chains model layers across multiple
|
||||||
nodes, introducing 'pipeline bubbles' while saving memory.
|
nodes, introducing 'pipeline bubbles' while saving memory.
|
||||||
|
ep_size : int
|
||||||
|
Expert Parallelism degree for MoE models. Introduces All-to-All
|
||||||
|
communication overhead across nodes.
|
||||||
|
v_stages : int
|
||||||
|
Number of virtual stages for interleaved pipeline schedules.
|
||||||
microbatch_count : int
|
microbatch_count : int
|
||||||
Number of microbatches (M). Increasing M reduces the pipeline
|
Number of microbatches (M). Increasing M reduces the pipeline
|
||||||
bubble but increases synchronization overhead.
|
bubble but increases synchronization overhead.
|
||||||
@@ -96,15 +104,15 @@ class DistributedSolver(BaseSolver):
|
|||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
Dict[str, Any]
|
Dict[str, Any]
|
||||||
Metrics including DP/TP latency, the Pipeline Bubble penalty,
|
Metrics including DP/TP/EP latency, the Pipeline Bubble penalty,
|
||||||
and the final Scaling Efficiency.
|
and the final Scaling Efficiency.
|
||||||
"""
|
"""
|
||||||
# 1. 3D Parallelism Decomposition
|
# 1. 3D/4D Parallelism Decomposition
|
||||||
n_accelerators = fleet.total_accelerators
|
n_accelerators = fleet.total_accelerators
|
||||||
dp_size = n_accelerators // (tp_size * pp_size)
|
dp_size = n_accelerators // (tp_size * pp_size * ep_size)
|
||||||
|
|
||||||
if dp_size < 1:
|
if dp_size < 1:
|
||||||
raise ValueError(f"Infeasible 3D Parallelism: TP({tp_size}) * PP({pp_size}) > Total({n_accelerators})")
|
raise ValueError(f"Infeasible 4D Parallelism: TP({tp_size}) * PP({pp_size}) * EP({ep_size}) > Total({n_accelerators})")
|
||||||
|
|
||||||
# 2. Single Node Performance (Computation)
|
# 2. Single Node Performance (Computation)
|
||||||
node_perf = Engine.solve(model, fleet.node.accelerator, batch_size=batch_size // dp_size, precision=precision, efficiency=efficiency)
|
node_perf = Engine.solve(model, fleet.node.accelerator, batch_size=batch_size // dp_size, precision=precision, efficiency=efficiency)
|
||||||
@@ -139,13 +147,25 @@ class DistributedSolver(BaseSolver):
|
|||||||
# TP Communication (Assuming intra-node NVLink)
|
# TP Communication (Assuming intra-node NVLink)
|
||||||
t_comm_tp = (message_size / tp_size / fleet.node.intra_node_bw).to("ms") if tp_size > 1 else Q_("0 ms")
|
t_comm_tp = (message_size / tp_size / fleet.node.intra_node_bw).to("ms") if tp_size > 1 else Q_("0 ms")
|
||||||
|
|
||||||
|
# EP Communication (All-to-All token routing for MoE)
|
||||||
|
if ep_size > 1:
|
||||||
|
t_comm_ep = calc_all_to_all_time(
|
||||||
|
message_bytes=message_size,
|
||||||
|
n_gpus=ep_size,
|
||||||
|
bandwidth_bytes_s=fleet.fabric.bandwidth / fleet.fabric.oversubscription_ratio,
|
||||||
|
latency_s=fleet.fabric.latency or Q_("5 us")
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
t_comm_ep = Q_("0 ms")
|
||||||
|
|
||||||
# 4. Pipeline Parallelism (PP) Bubble
|
# 4. Pipeline Parallelism (PP) Bubble
|
||||||
# Source: Narayanan et al. (2019), "PipePipe: Efficient Pipeline Parallelism"
|
# Source: Narayanan et al. (2019), "PipePipe: Efficient Pipeline Parallelism"
|
||||||
bubble_fraction = calc_pipeline_bubble(pp_size, microbatch_count)
|
# Supports interleaved 1F1B schedules via v_stages
|
||||||
|
bubble_fraction = calc_pipeline_bubble(pp_size, microbatch_count, v_stages=v_stages)
|
||||||
t_bubble = (node_perf.latency * bubble_fraction) if pp_size > 1 else Q_("0 ms")
|
t_bubble = (node_perf.latency * bubble_fraction) if pp_size > 1 else Q_("0 ms")
|
||||||
|
|
||||||
# 5. Total Latency and Scaling Efficiency
|
# 5. Total Latency and Scaling Efficiency
|
||||||
total_comm_latency = t_comm_dp + t_comm_tp
|
total_comm_latency = t_comm_dp + t_comm_tp + t_comm_ep
|
||||||
step_latency_total = node_perf.latency + total_comm_latency + t_bubble
|
step_latency_total = node_perf.latency + total_comm_latency + t_bubble
|
||||||
|
|
||||||
scaling_efficiency = (node_perf.latency / step_latency_total).magnitude
|
scaling_efficiency = (node_perf.latency / step_latency_total).magnitude
|
||||||
@@ -154,13 +174,14 @@ class DistributedSolver(BaseSolver):
|
|||||||
"node_performance": node_perf,
|
"node_performance": node_perf,
|
||||||
"dp_communication_latency": t_comm_dp,
|
"dp_communication_latency": t_comm_dp,
|
||||||
"tp_communication_latency": t_comm_tp,
|
"tp_communication_latency": t_comm_tp,
|
||||||
|
"ep_communication_latency": t_comm_ep,
|
||||||
"communication_latency": total_comm_latency, # Backwards compatibility for tests
|
"communication_latency": total_comm_latency, # Backwards compatibility for tests
|
||||||
"pipeline_bubble_latency": t_bubble,
|
"pipeline_bubble_latency": t_bubble,
|
||||||
"bubble_fraction": bubble_fraction,
|
"bubble_fraction": bubble_fraction,
|
||||||
"step_latency_total": step_latency_total,
|
"step_latency_total": step_latency_total,
|
||||||
"scaling_efficiency": scaling_efficiency,
|
"scaling_efficiency": scaling_efficiency,
|
||||||
"effective_throughput": (n_accelerators * node_perf.throughput * scaling_efficiency),
|
"effective_throughput": (n_accelerators * node_perf.throughput * scaling_efficiency),
|
||||||
"parallelism": {"dp": dp_size, "tp": tp_size, "pp": pp_size}
|
"parallelism": {"dp": dp_size, "tp": tp_size, "pp": pp_size, "ep": ep_size}
|
||||||
}
|
}
|
||||||
|
|
||||||
class ReliabilitySolver(BaseSolver):
|
class ReliabilitySolver(BaseSolver):
|
||||||
|
|||||||
@@ -1,12 +1,8 @@
|
|||||||
---
|
---
|
||||||
title: "Page Not Found"
|
title: "Page Not Found"
|
||||||
sidebar: false
|
sidebar: false
|
||||||
format:
|
page-layout: custom
|
||||||
html:
|
|
||||||
page-layout: custom
|
|
||||||
toc: false
|
|
||||||
---
|
---
|
||||||
|
|
||||||
<div style="min-height: 60vh; display: flex; flex-direction: column; align-items: center; justify-content: center; text-align: center; padding: 4rem 2rem;">
|
<div style="min-height: 60vh; display: flex; flex-direction: column; align-items: center; justify-content: center; text-align: center; padding: 4rem 2rem;">
|
||||||
|
|
||||||
<div style="font-size: 5rem; font-weight: 900; color: #E2E8F0; letter-spacing: -0.04em; line-height: 1; margin-bottom: 1.5rem;">404</div>
|
<div style="font-size: 5rem; font-weight: 900; color: #E2E8F0; letter-spacing: -0.04em; line-height: 1; margin-bottom: 1.5rem;">404</div>
|
||||||
|
|||||||
@@ -103,45 +103,56 @@ website:
|
|||||||
search: true
|
search: true
|
||||||
collapse-level: 1
|
collapse-level: 1
|
||||||
contents:
|
contents:
|
||||||
- getting-started.qmd
|
- section: "Welcome"
|
||||||
- solver-guide.qmd
|
|
||||||
- "---"
|
|
||||||
|
|
||||||
- section: "Tutorials"
|
|
||||||
contents:
|
contents:
|
||||||
|
- getting-started.qmd
|
||||||
|
- for-students.qmd
|
||||||
|
- for-instructors.qmd
|
||||||
|
- for-engineers.qmd
|
||||||
|
|
||||||
|
- section: "Using MLSYSIM"
|
||||||
|
contents:
|
||||||
|
- solver-guide.qmd
|
||||||
- tutorials/hello_world.qmd
|
- tutorials/hello_world.qmd
|
||||||
|
- tutorials/sustainability.qmd
|
||||||
- tutorials/llm_serving.qmd
|
- tutorials/llm_serving.qmd
|
||||||
- tutorials/distributed.qmd
|
- tutorials/distributed.qmd
|
||||||
- tutorials/sustainability.qmd
|
|
||||||
|
|
||||||
- section: "Catalogs"
|
- section: "The MLSys Zoo"
|
||||||
href: zoo/index.qmd
|
href: zoo/index.qmd
|
||||||
contents:
|
contents:
|
||||||
- zoo/hardware.qmd
|
- zoo/hardware.qmd
|
||||||
- zoo/models.qmd
|
- zoo/models.qmd
|
||||||
- zoo/fleets.qmd
|
- zoo/fleets.qmd
|
||||||
- zoo/infra.qmd
|
- zoo/infra.qmd
|
||||||
- "---"
|
|
||||||
|
|
||||||
- math.qmd
|
- section: "Foundations"
|
||||||
- glossary.qmd
|
contents:
|
||||||
- accuracy.qmd
|
- architecture.qmd
|
||||||
- "---"
|
- math.qmd
|
||||||
|
- glossary.qmd
|
||||||
|
- accuracy.qmd
|
||||||
|
|
||||||
- text: "Whitepaper"
|
- section: "About"
|
||||||
href: whitepaper.qmd
|
contents:
|
||||||
- contributing.qmd
|
- whitepaper.qmd
|
||||||
- "---"
|
- contributing.qmd
|
||||||
|
|
||||||
- section: "API"
|
- section: "API Reference"
|
||||||
href: api/index.qmd
|
href: api/index.qmd
|
||||||
contents:
|
contents:
|
||||||
- api/hardware.qmd
|
- text: "Hardware"
|
||||||
- api/models.qmd
|
href: api/hardware.qmd
|
||||||
- api/systems.qmd
|
- text: "Models"
|
||||||
- api/infra.qmd
|
href: api/models.qmd
|
||||||
- api/core.qmd
|
- text: "Systems"
|
||||||
- api/core.solver.qmd
|
href: api/systems.qmd
|
||||||
|
- text: "Infrastructure"
|
||||||
|
href: api/infra.qmd
|
||||||
|
- text: "Core"
|
||||||
|
href: api/core.qmd
|
||||||
|
- text: "Solvers"
|
||||||
|
href: api/core.solver.qmd
|
||||||
|
|
||||||
# Footer — ecosystem pattern (matches Kits)
|
# Footer — ecosystem pattern (matches Kits)
|
||||||
page-footer:
|
page-footer:
|
||||||
@@ -169,7 +180,7 @@ format:
|
|||||||
respect-user-color-scheme: true
|
respect-user-color-scheme: true
|
||||||
css: styles/landing.css
|
css: styles/landing.css
|
||||||
toc: true
|
toc: true
|
||||||
toc-depth: 3
|
toc-depth: 4
|
||||||
toc-title: "On this page"
|
toc-title: "On this page"
|
||||||
number-sections: false
|
number-sections: false
|
||||||
code-copy: true
|
code-copy: true
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Model Accuracy & Validation"
|
title: "Model Accuracy & Validation"
|
||||||
subtitle: "How well do MLSYSIM predictions match measured hardware performance?"
|
subtitle: "How well do MLSYSIM predictions match measured hardware performance?"
|
||||||
---
|
---
|
||||||
|
|
||||||
MLSYSIM is a **first-order analytical model** — it predicts performance from analytical equations,
|
MLSYSIM is a **first-order analytical model** — it predicts performance from analytical equations,
|
||||||
not from empirical measurements. This page documents where those predictions are accurate,
|
not from empirical measurements. This page documents where those predictions are accurate,
|
||||||
where they diverge, and why.
|
where they diverge, and why.
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "hardware"
|
title: "hardware"
|
||||||
subtitle: "Hardware specifications and device registry"
|
subtitle: "Hardware specifications and device registry"
|
||||||
---
|
---
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import mlsysim
|
import mlsysim
|
||||||
from mlsysim.hardware.types import ComputeCore, MemoryHierarchy, HardwareNode
|
from mlsysim.hardware.types import ComputeCore, MemoryHierarchy, HardwareNode
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "API Reference"
|
title: "API Reference"
|
||||||
subtitle: "The 5-Layer MLSYSIM Stack: from Silicon to Sustainability"
|
subtitle: "The 5-Layer MLSYSIM Stack: from Silicon to Sustainability"
|
||||||
---
|
---
|
||||||
|
|
||||||
MLSYSIM is a pedagogical simulation platform for reasoning about ML systems trade-offs across the full stack. Every number is unit-typed via [Pint](https://pint.readthedocs.io), every specification is sourced from vendor datasheets, and every solver implements a closed-form analytical model -- no black-box benchmarks.
|
MLSYSIM is a pedagogical simulation platform for reasoning about ML systems trade-offs across the full stack. Every number is unit-typed via [Pint](https://pint.readthedocs.io), every specification is sourced from vendor datasheets, and every solver implements a closed-form analytical model -- no black-box benchmarks.
|
||||||
|
|
||||||
## Architecture
|
## Architecture
|
||||||
|
|||||||
88
mlsysim/docs/architecture.qmd
Normal file
88
mlsysim/docs/architecture.qmd
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
---
|
||||||
|
title: "The 5-Layer Architecture"
|
||||||
|
subtitle: "The Mental Model of Progressive Lowering"
|
||||||
|
---
|
||||||
|
The core philosophy of MLSYSIM is **Progressive Lowering**. Rather than treating machine learning systems as black boxes, MLSYSIM organizes the domain into five composable layers.
|
||||||
|
|
||||||
|
Abstract workload *demand* (Layer A) is progressively mapped onto concrete hardware *supply* (Layers B, C, D) through analytical *solvers* (Layer E). Understanding this stack is the key to mastering both this library and the textbook it accompanies.
|
||||||
|
|
||||||
|
## The Stack Diagram
|
||||||
|
|
||||||
|
```{mermaid}
|
||||||
|
%%{init: {'theme': 'neutral'}}%%
|
||||||
|
%%| fig-cap: "The MLSYSIM 5-Layer Stack. Workloads (demand) are lowered onto Hardware (supply) through Infrastructure and Systems layers. Solvers bridge demand and supply to produce analytical profiles."
|
||||||
|
%%| fig-width: 100%
|
||||||
|
flowchart TB
|
||||||
|
A["<b>Layer A: Workloads (Demand)</b><br/>TransformerWorkload, CNNWorkload<br/><i>Parameters, FLOPs, Arithmetic Intensity</i>"]
|
||||||
|
B["<b>Layer B: Hardware (Silicon)</b><br/>HardwareNode, ComputeCore, MemoryHierarchy<br/><i>Peak FLOP/s, Bandwidth, Capacity, TDP</i>"]
|
||||||
|
C["<b>Layer C: Infrastructure (Environment)</b><br/>GridProfile, Datacenter<br/><i>Carbon Intensity, PUE, WUE</i>"]
|
||||||
|
D["<b>Layer D: Systems (Topology)</b><br/>Node, Fleet, NetworkFabric<br/><i>Topology, Accelerators/Node, Fabric BW</i>"]
|
||||||
|
E["<b>Layer E: Solvers (Analysis)</b><br/>SingleNode · Distributed · Serving<br/>Economics · Sustainability · Reliability"]
|
||||||
|
F["<b>Results</b><br/>PerformanceProfile"]
|
||||||
|
|
||||||
|
A --> E
|
||||||
|
B --> D
|
||||||
|
C --> D
|
||||||
|
D --> E
|
||||||
|
E --> F
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Layer A: Workloads (Demand)
|
||||||
|
|
||||||
|
A **Workload** is a hardware-agnostic description of computational demand. You don't ask "How fast is Llama-3?", you ask "How many FLOPs and memory bytes does Llama-3 require?"
|
||||||
|
|
||||||
|
In MLSYSIM, `TransformerWorkload` and `CNNWorkload` define these intrinsic properties (parameter count, layer count, sequence length). The crucial step happens when a workload is "lowered" at a specific numerical precision (e.g., FP16 vs INT8). This lowering step determines the **Arithmetic Intensity** (ops/byte) — the ratio that decides whether a model will be compute-bound or memory-bound on physical hardware.
|
||||||
|
|
||||||
|
*See the [Model Zoo](zoo/models.qmd) for vetted workloads.*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Layer B: Hardware (Supply)
|
||||||
|
|
||||||
|
A **`HardwareNode`** represents a single physical accelerator (like an H100 GPU or an Apple M3 chip). It provides the raw physical supply:
|
||||||
|
|
||||||
|
* **Compute:** Theoretical peak throughput (TFLOP/s) across different precisions (FP32, FP16, INT8).
|
||||||
|
* **Memory:** High Bandwidth Memory (HBM) capacity and transfer speed (TB/s).
|
||||||
|
* **Power:** Thermal Design Power (TDP).
|
||||||
|
|
||||||
|
Every piece of silicon has a "Ridge Point" (Peak FLOPs / Memory Bandwidth). If your Workload's arithmetic intensity is lower than the hardware's ridge point, you are memory-bound.
|
||||||
|
|
||||||
|
*See the [Silicon Zoo](zoo/hardware.qmd) for vetted hardware specs.*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Layer C: Infrastructure (Environment)
|
||||||
|
|
||||||
|
Hardware doesn't run in a vacuum; it runs in datacenters plugged into regional power grids. The **`GridProfile`** captures this physical context.
|
||||||
|
|
||||||
|
A 1000-watt GPU running in Quebec (hydroelectric power) vs. Poland (coal power) produces vastly different carbon footprints, despite doing the exact same mathematical operations. This layer introduces Power Usage Effectiveness (PUE) and Carbon Intensity to the analytical model.
|
||||||
|
|
||||||
|
*See the [Infrastructure Zoo](zoo/infra.qmd) for regional grid profiles.*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Layer D: Systems (Topology)
|
||||||
|
|
||||||
|
You cannot train a 100-Billion parameter model on a single GPU. A **`Fleet`** composes individual `HardwareNode`s into a distributed cluster.
|
||||||
|
|
||||||
|
* **`Node`:** Groups accelerators within a physical server chassis (e.g., 8x GPUs).
|
||||||
|
* **`NetworkFabric`:** Specifies how servers talk to each other (e.g., 400 Gbps InfiniBand NDR).
|
||||||
|
|
||||||
|
The way you structure this system determines your communication overhead and your scaling efficiency when you apply 3D/4D Parallelism.
|
||||||
|
|
||||||
|
*See the [Fleet Zoo](zoo/fleets.qmd) for production cluster topologies.*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Layer E: Solvers (Analysis)
|
||||||
|
|
||||||
|
The previous four layers are just static definitions (nouns). **Solvers** are the engines (verbs) that bridge demand and supply to answer specific questions.
|
||||||
|
|
||||||
|
Each solver implements closed-form equations from classic systems literature:
|
||||||
|
* **`SingleNodeSolver`**: Maps Layer A to Layer B using the Roofline model.
|
||||||
|
* **`DistributedSolver`**: Maps Layer A to Layer D using Ring All-Reduce and Pipeline schedules.
|
||||||
|
* **`SustainabilitySolver`**: Maps Layer D to Layer C using energy physics.
|
||||||
|
|
||||||
|
*See the [Solver Guide](solver-guide.qmd) to learn how to apply these engines.*
|
||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Contributing to MLSYSIM"
|
title: "Contributing to MLSYSIM"
|
||||||
subtitle: "How to add hardware specs, write tutorials, and grow the MLSys Zoo."
|
subtitle: "How to add hardware specs, write tutorials, and grow the MLSys Zoo."
|
||||||
---
|
---
|
||||||
|
|
||||||
MLSYSIM grows stronger with every new hardware spec, tutorial, and bug report. This guide
|
MLSYSIM grows stronger with every new hardware spec, tutorial, and bug report. This guide
|
||||||
explains how to contribute — whether you are a student who found a discrepancy in a spec,
|
explains how to contribute — whether you are a student who found a discrepancy in a spec,
|
||||||
an instructor who wants to share a teaching scenario, or a practitioner who wants a new
|
an instructor who wants to share a teaching scenario, or a practitioner who wants a new
|
||||||
|
|||||||
154
mlsysim/docs/for-engineers.qmd
Normal file
154
mlsysim/docs/for-engineers.qmd
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
---
|
||||||
|
title: "For Engineers & Researchers"
|
||||||
|
subtitle: "Back-of-envelope estimates before you provision hardware."
|
||||||
|
---
|
||||||
|
MLSYSIM gives you quick, type-safe analytical estimates for capacity planning, hardware selection, cost modeling, and sustainability analysis — in seconds, from specifications alone.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Why Use Analytical Models?
|
||||||
|
|
||||||
|
Before running expensive benchmarks or provisioning cloud instances, you need directional answers:
|
||||||
|
|
||||||
|
- **Will this model fit in GPU memory?** — Check before renting the GPU
|
||||||
|
- **What's the expected TTFT for my LLM?** — Estimate before building the serving stack
|
||||||
|
- **How many H100s do I actually need?** — Model scaling efficiency before buying the cluster
|
||||||
|
- **What will this cost per year?** — TCO analysis before signing the contract
|
||||||
|
|
||||||
|
MLSYSIM answers these in microseconds using first-order equations. It won't replace profiling, but it tells you *where to start profiling*.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick API Usage
|
||||||
|
|
||||||
|
```python
|
||||||
|
import mlsysim
|
||||||
|
from mlsysim import Engine, ServingSolver, DistributedSolver
|
||||||
|
|
||||||
|
# Single-node: Is ResNet-50 memory-bound on A100?
|
||||||
|
profile = Engine.solve(
|
||||||
|
model=mlsysim.Models.ResNet50,
|
||||||
|
hardware=mlsysim.Hardware.Cloud.A100,
|
||||||
|
batch_size=1, precision="fp16"
|
||||||
|
)
|
||||||
|
print(f"{profile.bottleneck}, {profile.latency.to('ms'):~.2f}")
|
||||||
|
|
||||||
|
# LLM serving: What's the TTFT for Llama-3.1-70B on H100?
|
||||||
|
serving = ServingSolver()
|
||||||
|
result = serving.solve(
|
||||||
|
model=mlsysim.Models.Language.Llama3_70B,
|
||||||
|
hardware=mlsysim.Hardware.Cloud.H100,
|
||||||
|
seq_len=4096, batch_size=1
|
||||||
|
)
|
||||||
|
print(f"TTFT: {result['ttft'].to('ms'):~.1f}")
|
||||||
|
print(f"ITL: {result['itl'].to('ms'):~.2f}")
|
||||||
|
print(f"KV-cache: {result['kv_cache_size'].to('GB'):~.1f}")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Hardware Sweep Pattern
|
||||||
|
|
||||||
|
Compare devices programmatically instead of reading datasheets:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import mlsysim
|
||||||
|
from mlsysim import Engine
|
||||||
|
|
||||||
|
model = mlsysim.Models.ResNet50
|
||||||
|
|
||||||
|
for hw in [mlsysim.Hardware.Cloud.H100,
|
||||||
|
mlsysim.Hardware.Cloud.A100,
|
||||||
|
mlsysim.Hardware.Cloud.T4,
|
||||||
|
mlsysim.Hardware.Edge.JetsonAGX]:
|
||||||
|
p = Engine.solve(model=model, hardware=hw, batch_size=32, precision="fp16")
|
||||||
|
print(f"{hw.name:20s} {p.bottleneck:16s} {p.latency.to('ms'):>8.2f~} {p.throughput:>8.0f} img/s")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Composing Solvers for Real Questions
|
||||||
|
|
||||||
|
The six solvers are designed to chain:
|
||||||
|
|
||||||
|
### "Can I serve Llama-70B on 4 H100s within budget?"
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mlsysim import ServingSolver, EconomicsSolver
|
||||||
|
|
||||||
|
# Step 1: Does it fit and what's the latency?
|
||||||
|
serving = ServingSolver()
|
||||||
|
result = serving.solve(
|
||||||
|
model=mlsysim.Models.Language.Llama3_70B,
|
||||||
|
hardware=mlsysim.Hardware.Cloud.H100,
|
||||||
|
seq_len=4096, batch_size=1
|
||||||
|
)
|
||||||
|
|
||||||
|
# Step 2: What does that fleet cost?
|
||||||
|
econ = EconomicsSolver()
|
||||||
|
cost = econ.solve(
|
||||||
|
fleet=mlsysim.Systems.Clusters.H100_8,
|
||||||
|
duration_days=365,
|
||||||
|
kwh_price=0.08
|
||||||
|
)
|
||||||
|
print(f"Annual TCO: ${cost['total_tco'].magnitude:,.0f}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### "Where should I train to minimize carbon?"
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mlsysim import SustainabilitySolver
|
||||||
|
|
||||||
|
sustain = SustainabilitySolver()
|
||||||
|
for grid in [mlsysim.Infra.Grids.Quebec, mlsysim.Infra.Grids.US_Average,
|
||||||
|
mlsysim.Infra.Grids.Poland]:
|
||||||
|
r = sustain.solve(
|
||||||
|
fleet=mlsysim.Systems.Clusters.H100_256,
|
||||||
|
duration_days=30,
|
||||||
|
datacenter=grid
|
||||||
|
)
|
||||||
|
print(f"{grid.name:12s} {r['carbon_kg'].to('metric_ton'):>8.1f~}")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Writing Custom Solvers
|
||||||
|
|
||||||
|
Follow the built-in solver pattern to create your own analysis:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from mlsysim.hardware.types import HardwareNode
|
||||||
|
|
||||||
|
class PowerEfficiencySolver:
|
||||||
|
def solve(self, hardware: HardwareNode) -> dict:
|
||||||
|
flops_per_watt = hardware.compute.peak_flops / hardware.tdp
|
||||||
|
return {
|
||||||
|
"device": hardware.name,
|
||||||
|
"flops_per_watt": flops_per_watt.to("TFLOPs/s/kW"),
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
See [Extending MLSYSIM](solver-guide.qmd#extending-mlsysim) for the full guide.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Type Safety
|
||||||
|
|
||||||
|
All quantities are `pint.Quantity` objects. Unit conversions are explicit, and dimensional errors are caught at runtime:
|
||||||
|
|
||||||
|
```python
|
||||||
|
hw = mlsysim.Hardware.Cloud.A100
|
||||||
|
hw.compute.peak_flops.to("TFLOPs/s") # → 312.0 TFLOPs/s
|
||||||
|
hw.memory.bandwidth.to("TB/s") # → 2.0 TB/s
|
||||||
|
hw.memory.bandwidth.to("FLOP/s") # → DimensionalityError ✓
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
- **[Getting Started](getting-started.qmd)** — Install and run your first analysis
|
||||||
|
- **[Solver Guide](solver-guide.qmd)** — Which solver for which question
|
||||||
|
- **[MLSys Zoo](zoo/index.qmd)** — Browse all available hardware, model, and infrastructure specs
|
||||||
|
- **[API Reference](api/index.qmd)** — Full programmatic API documentation
|
||||||
|
- **[Accuracy & Validation](accuracy.qmd)** — How analytical bounds compare to empirical measurements
|
||||||
96
mlsysim/docs/for-instructors.qmd
Normal file
96
mlsysim/docs/for-instructors.qmd
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
---
|
||||||
|
title: "For Instructors"
|
||||||
|
subtitle: "Reproducible, hardware-independent exercises for ML systems courses."
|
||||||
|
---
|
||||||
|
MLSYSIM provides a framework for assigning analytically grounded problem sets where every answer is deterministic and reproducible — regardless of what hardware your students have access to.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Why MLSYSIM for Teaching?
|
||||||
|
|
||||||
|
| Challenge | How MLSYSIM Helps |
|
||||||
|
|:----------|:------------------|
|
||||||
|
| Students lack GPU access | All analysis runs on a laptop — no cloud credits needed |
|
||||||
|
| Homework answers vary by hardware | Vetted registry specs produce identical results everywhere |
|
||||||
|
| Hard to grade open-ended systems questions | Analytical solvers give deterministic, verifiable outputs |
|
||||||
|
| Specifications become stale | Registry updated from official datasheets; one update propagates everywhere |
|
||||||
|
| Students memorize without understanding | "Predict first" exercises build genuine intuition |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Course Integration Patterns
|
||||||
|
|
||||||
|
### Pattern 1: Textbook Companion
|
||||||
|
|
||||||
|
MLSYSIM maps directly to chapters in the [Machine Learning Systems](https://mlsysbook.ai) textbook. Assign tutorials alongside readings:
|
||||||
|
|
||||||
|
| Week | Textbook Chapter | MLSYSIM Assignment |
|
||||||
|
|:-----|:-----------------|:-------------------|
|
||||||
|
| 3 | Hardware Acceleration | [Hello World](tutorials/hello_world.qmd) — Roofline analysis, batch size sweep |
|
||||||
|
| 5 | Model Serving | [LLM Serving](tutorials/llm_serving.qmd) — TTFT/ITL analysis |
|
||||||
|
| 7 | Distributed Training | [Distributed Training](tutorials/distributed.qmd) — 3D parallelism |
|
||||||
|
| 9 | Sustainable AI | [Sustainability Lab](tutorials/sustainability.qmd) — Carbon footprint |
|
||||||
|
| 11 | Compute Infrastructure | [Solver Guide](solver-guide.qmd) — Composing solvers for TCO analysis |
|
||||||
|
|
||||||
|
### Pattern 2: Standalone Labs
|
||||||
|
|
||||||
|
Use individual tutorials as self-contained lab assignments in any systems course. Each tutorial includes exercises with clear expected outputs.
|
||||||
|
|
||||||
|
### Pattern 3: Capstone Projects
|
||||||
|
|
||||||
|
Advanced students can write custom solvers (see [Extending MLSYSIM](solver-guide.qmd#extending-mlsysim)) or compose multiple solvers to answer research-style questions.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Assignment Ideas
|
||||||
|
|
||||||
|
### Homework: Hardware Comparison (30 min)
|
||||||
|
> Using `Engine.solve()`, compare ResNet-50 inference latency on the A100, H100, and Jetson AGX at batch sizes 1, 32, and 256. For each configuration, state whether the workload is memory-bound or compute-bound and explain why the bottleneck changes.
|
||||||
|
|
||||||
|
### Lab: Carbon-Aware Training (45 min)
|
||||||
|
> Using the SustainabilitySolver, calculate the carbon footprint of training GPT-3 on a 256-GPU H100 cluster in Quebec vs. US Average vs. Poland. Produce a table and a 2-paragraph analysis of why location matters.
|
||||||
|
|
||||||
|
### Exam Question: Back-of-Envelope
|
||||||
|
> The NVIDIA H100 has 1,979 TFLOP/s (FP16) and 3.35 TB/s bandwidth. What is the ridge point in FLOP/Byte? If a model has arithmetic intensity of 50 FLOP/Byte, is it compute-bound or memory-bound? Show your work.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Reproducibility Guarantee
|
||||||
|
|
||||||
|
All specifications in the [MLSys Zoo](zoo/index.qmd) are:
|
||||||
|
|
||||||
|
- **Sourced** from official manufacturer datasheets and published benchmarks
|
||||||
|
- **Typed** with `pint.Quantity` for dimensional correctness
|
||||||
|
- **Frozen** per release — `mlsysim==0.1.0` always produces the same answers
|
||||||
|
|
||||||
|
This means your answer key works for every student, every semester.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Jupyter & Quarto Compatibility
|
||||||
|
|
||||||
|
All tutorials are designed to run in:
|
||||||
|
|
||||||
|
- **Jupyter Notebooks** — Standard `.ipynb` workflow
|
||||||
|
- **Quarto documents** — Render to HTML, PDF, or slides with `quarto render`
|
||||||
|
- **Google Colab** — `pip install mlsysim` in the first cell, then go
|
||||||
|
|
||||||
|
No GPU runtime required. CPU-only environments work perfectly because MLSYSIM computes from equations, not empirical profiling.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Getting Started
|
||||||
|
|
||||||
|
1. Point students to the [Getting Started](getting-started.qmd) guide for installation
|
||||||
|
2. Assign the [Hello World](tutorials/hello_world.qmd) tutorial as a warmup
|
||||||
|
3. Use the [Solver Guide](solver-guide.qmd) to select solvers for your course topics
|
||||||
|
4. Browse the [MLSys Zoo](zoo/index.qmd) for available hardware and model specifications
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
- **[Solver Guide](solver-guide.qmd)** — Which solver maps to which topic
|
||||||
|
- **[Math Foundations](math.qmd)** — All equations, for your own reference and exam prep
|
||||||
|
- **[Accuracy & Validation](accuracy.qmd)** — How close are analytical estimates to empirical results?
|
||||||
|
- **[Whitepaper](whitepaper.qmd)** — The academic paper describing MLSYSIM's design and pedagogy
|
||||||
93
mlsysim/docs/for-students.qmd
Normal file
93
mlsysim/docs/for-students.qmd
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
---
|
||||||
|
title: "For Students"
|
||||||
|
subtitle: "Build intuition for ML systems — without needing GPU hardware."
|
||||||
|
---
|
||||||
|
Whether you're taking your first ML systems course or preparing for industry interviews, MLSYSIM lets you experiment with real hardware specifications and see exactly *why* systems behave the way they do.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## What You'll Learn
|
||||||
|
|
||||||
|
By working through the MLSYSIM tutorials and exercises, you will:
|
||||||
|
|
||||||
|
- **Identify bottlenecks** — Determine whether a workload is memory-bound or compute-bound on any hardware, and understand *why*
|
||||||
|
- **Reason quantitatively** — Use real datasheet numbers (not made-up examples) to calculate latency, throughput, and cost
|
||||||
|
- **Build systems intuition** — See how batch size, precision, parallelism strategy, and datacenter location each affect performance
|
||||||
|
- **Think across the stack** — Connect workload characteristics to hardware specs to infrastructure constraints
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Your Learning Path
|
||||||
|
|
||||||
|
Start at the top and work through in order. Each tutorial builds on the one before it.
|
||||||
|
|
||||||
|
| Step | Tutorial | You'll Learn | Time |
|
||||||
|
|:-----|:---------|:-------------|:-----|
|
||||||
|
| 1 | [Hello World](tutorials/hello_world.qmd) | The roofline model, memory-bound vs. compute-bound, batch size sweeps | 15 min |
|
||||||
|
| 2 | [Sustainability Lab](tutorials/sustainability.qmd) | Energy, carbon footprint, regional grid effects | 20 min |
|
||||||
|
| 3 | [LLM Serving](tutorials/llm_serving.qmd) | TTFT vs. ITL, KV-cache pressure, the two phases of LLM inference | 25 min |
|
||||||
|
| 4 | [Distributed Training](tutorials/distributed.qmd) | Data/tensor/pipeline parallelism, communication overhead, scaling efficiency | 30 min |
|
||||||
|
|
||||||
|
::: {.callout-tip}
|
||||||
|
## Predict Before You Compute
|
||||||
|
Every tutorial includes "predict first" exercises. Before running code, write down what you expect. This practice builds the mental models that make you effective at systems reasoning.
|
||||||
|
:::
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## How MLSYSIM Pairs with the Textbook
|
||||||
|
|
||||||
|
MLSYSIM is the companion framework for the [Machine Learning Systems](https://mlsysbook.ai) textbook. Each solver maps to specific chapters:
|
||||||
|
|
||||||
|
| Textbook Topic | MLSYSIM Solver | What It Models |
|
||||||
|
|:---------------|:---------------|:---------------|
|
||||||
|
| Hardware Acceleration | SingleNodeSolver | Roofline analysis, compute vs. memory bottleneck |
|
||||||
|
| Model Serving | ServingSolver | TTFT, ITL, KV-cache memory |
|
||||||
|
| Distributed Training | DistributedSolver | 3D parallelism, all-reduce, pipeline bubbles |
|
||||||
|
| Compute Infrastructure | EconomicsSolver | CapEx, OpEx, TCO |
|
||||||
|
| Sustainable AI | SustainabilitySolver | Energy, carbon, water usage |
|
||||||
|
| Fault Tolerance | ReliabilitySolver | MTBF, checkpoint interval |
|
||||||
|
|
||||||
|
Not using the textbook? No problem — MLSYSIM is self-contained. The [Math Foundations](math.qmd) page documents every equation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- **Python**: Comfortable with functions, loops, and f-strings
|
||||||
|
- **Math**: Basic algebra (no calculus required — all solver equations are arithmetic)
|
||||||
|
- **ML**: Familiarity with terms like "model parameters," "inference," and "training" (the [Glossary](glossary.qmd) defines everything else)
|
||||||
|
|
||||||
|
No GPU, no cloud account, no special hardware required. Just:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install mlsysim
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```python
|
||||||
|
import mlsysim
|
||||||
|
from mlsysim import Engine
|
||||||
|
|
||||||
|
# Load a model and hardware from the vetted registry
|
||||||
|
model = mlsysim.Models.ResNet50
|
||||||
|
gpu = mlsysim.Hardware.Cloud.A100
|
||||||
|
|
||||||
|
# Solve: is this workload memory-bound or compute-bound?
|
||||||
|
profile = Engine.solve(model=model, hardware=gpu, batch_size=1, precision="fp16")
|
||||||
|
|
||||||
|
print(f"Bottleneck: {profile.bottleneck}") # → Memory Bound
|
||||||
|
print(f"Latency: {profile.latency.to('ms'):~.2f}")
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Next Steps
|
||||||
|
|
||||||
|
- **[Getting Started](getting-started.qmd)** — Install MLSYSIM and run your first analysis
|
||||||
|
- **[Hello World Tutorial](tutorials/hello_world.qmd)** — Your first roofline analysis
|
||||||
|
- **[Glossary](glossary.qmd)** — Look up any unfamiliar term
|
||||||
|
- **[Math Foundations](math.qmd)** — The equations behind every solver
|
||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Getting Started"
|
title: "Getting Started"
|
||||||
subtitle: "Install MLSYSIM and run your first analysis in under 5 minutes."
|
subtitle: "Install MLSYSIM and run your first analysis in under 5 minutes."
|
||||||
---
|
---
|
||||||
|
|
||||||
::: {.callout-note}
|
::: {.callout-note}
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
MLSYSIM assumes basic Python familiarity (variables, functions, `pip install`). No prior ML or hardware knowledge is required. Key concepts like **roofline analysis**, **memory-bound vs. compute-bound**, and **FLOP/s** are explained in context throughout the tutorials. For a full reference of terms, see the [Glossary](glossary.qmd).
|
MLSYSIM assumes basic Python familiarity (variables, functions, `pip install`). No prior ML or hardware knowledge is required. Key concepts like **roofline analysis**, **memory-bound vs. compute-bound**, and **FLOP/s** are explained in context throughout the tutorials. For a full reference of terms, see the [Glossary](glossary.qmd).
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Glossary"
|
title: "Glossary"
|
||||||
subtitle: "Definitions for every term used in the MLSYSIM documentation."
|
subtitle: "Definitions for every term used in the MLSYSIM documentation."
|
||||||
---
|
---
|
||||||
|
|
||||||
This page defines every technical term used across the MLSYSIM documentation.
|
This page defines every technical term used across the MLSYSIM documentation.
|
||||||
When a term is first used on any page, it either links here or is defined inline.
|
When a term is first used on any page, it either links here or is defined inline.
|
||||||
|
|
||||||
|
|||||||
@@ -2,86 +2,7 @@
|
|||||||
title: "MLSYSIM"
|
title: "MLSYSIM"
|
||||||
page-layout: custom
|
page-layout: custom
|
||||||
sidebar: false
|
sidebar: false
|
||||||
format:
|
|
||||||
html:
|
|
||||||
toc: false
|
|
||||||
include-in-header:
|
|
||||||
text: |
|
|
||||||
<style>
|
|
||||||
.quarto-title, .quarto-title-meta, h1.title, .breadcrumb { display: none !important; }
|
|
||||||
#title-block-header { display: none !important; }
|
|
||||||
</style>
|
|
||||||
<script>
|
|
||||||
function copyInstall() {
|
|
||||||
navigator.clipboard.writeText('pip install mlsysim').then(function() {
|
|
||||||
var btn = document.getElementById('copy-btn');
|
|
||||||
var orig = btn.textContent;
|
|
||||||
btn.textContent = 'Copied!';
|
|
||||||
setTimeout(function() { btn.textContent = orig; }, 2000);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Count-up animation for stats
|
|
||||||
document.addEventListener('DOMContentLoaded', function() {
|
|
||||||
var observer = new IntersectionObserver(function(entries) {
|
|
||||||
entries.forEach(function(entry) {
|
|
||||||
if (entry.isIntersecting) {
|
|
||||||
var nums = entry.target.querySelectorAll('.im-stat-num');
|
|
||||||
nums.forEach(function(el) {
|
|
||||||
var text = el.textContent.trim();
|
|
||||||
var suffix = text.replace(/[0-9]/g, '');
|
|
||||||
var target = parseInt(text);
|
|
||||||
if (isNaN(target)) return;
|
|
||||||
var duration = 1200;
|
|
||||||
var start = performance.now();
|
|
||||||
el.textContent = '0' + suffix;
|
|
||||||
function step(now) {
|
|
||||||
var progress = Math.min((now - start) / duration, 1);
|
|
||||||
var eased = 1 - Math.pow(1 - progress, 3);
|
|
||||||
el.textContent = Math.round(target * eased) + suffix;
|
|
||||||
if (progress < 1) requestAnimationFrame(step);
|
|
||||||
}
|
|
||||||
requestAnimationFrame(step);
|
|
||||||
});
|
|
||||||
observer.unobserve(entry.target);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}, { threshold: 0.5 });
|
|
||||||
|
|
||||||
var stats = document.querySelector('.im-stats');
|
|
||||||
if (stats) observer.observe(stats);
|
|
||||||
|
|
||||||
// Carousel
|
|
||||||
var slides = document.querySelectorAll('.im-slide');
|
|
||||||
var dots = document.querySelectorAll('.im-dot');
|
|
||||||
var current = 0;
|
|
||||||
var timer;
|
|
||||||
|
|
||||||
function showSlide(n) {
|
|
||||||
slides.forEach(function(s) { s.classList.remove('im-slide-active'); });
|
|
||||||
dots.forEach(function(d) { d.classList.remove('im-dot-active'); });
|
|
||||||
current = n;
|
|
||||||
slides[current].classList.add('im-slide-active');
|
|
||||||
dots[current].classList.add('im-dot-active');
|
|
||||||
}
|
|
||||||
|
|
||||||
function nextSlide() { showSlide((current + 1) % slides.length); }
|
|
||||||
|
|
||||||
function startTimer() { timer = setInterval(nextSlide, 5000); }
|
|
||||||
|
|
||||||
dots.forEach(function(dot) {
|
|
||||||
dot.addEventListener('click', function() {
|
|
||||||
clearInterval(timer);
|
|
||||||
showSlide(parseInt(this.dataset.slide));
|
|
||||||
startTimer();
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
if (slides.length > 0) startTimer();
|
|
||||||
});
|
|
||||||
</script>
|
|
||||||
---
|
---
|
||||||
|
|
||||||
<!-- ============================================================
|
<!-- ============================================================
|
||||||
HERO (one cohesive dark section)
|
HERO (one cohesive dark section)
|
||||||
============================================================ -->
|
============================================================ -->
|
||||||
@@ -98,11 +19,18 @@ MLSYSIM
|
|||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-subtitle}
|
::: {.im-subtitle}
|
||||||
Predict ML system performance, cost, and carbon from first principles.
|
Predict ML system performance, cost, and carbon.<br/>From first principles.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
|
<div class="im-stats">
|
||||||
|
<div class="im-stat"><span class="im-stat-num">Fundamental</span><span class="im-stat-label">Physics Solvers</span></div>
|
||||||
|
<div class="im-stat"><span class="im-stat-num">18+</span><span class="im-stat-label">Vetted Hardware Specs</span></div>
|
||||||
|
<div class="im-stat"><span class="im-stat-num">13+</span><span class="im-stat-label">Reference Workloads</span></div>
|
||||||
|
<div class="im-stat"><span class="im-stat-num">4</span><span class="im-stat-label">Carbon-Aware Regions</span></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
::: {.im-hero-desc}
|
::: {.im-hero-desc}
|
||||||
Analytical solvers for reasoning about ML workloads, from microcontrollers to thousand-GPU clusters, without provisioning any hardware.
|
Reason about ML workloads—from microcontrollers to GPU clusters—without provisioning any hardware.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-install}
|
::: {.im-install}
|
||||||
@@ -124,10 +52,12 @@ Analytical solvers for reasoning about ML workloads, from microcontrollers to th
|
|||||||
<div class="im-hero-inner">
|
<div class="im-hero-inner">
|
||||||
<div class="im-carousel">
|
<div class="im-carousel">
|
||||||
<div class="im-carousel-track">
|
<div class="im-carousel-track">
|
||||||
|
<button class="im-arrow im-arrow-prev" aria-label="Previous slide">‹</button>
|
||||||
|
<button class="im-arrow im-arrow-next" aria-label="Next slide">›</button>
|
||||||
<div class="im-slide im-slide-active" data-index="0">
|
<div class="im-slide im-slide-active" data-index="0">
|
||||||
<div class="im-slide-label">Roofline Analysis</div>
|
<div class="im-slide-label">Roofline Analysis</div>
|
||||||
<div class="im-slide-viz">
|
<div class="im-slide-viz">
|
||||||
<svg viewBox="0 0 320 120" class="im-roofline-svg">
|
<svg viewBox="0 0 320 130" class="im-roofline-svg">
|
||||||
<line x1="40" y1="100" x2="300" y2="100" stroke="rgba(148,163,184,0.3)" stroke-width="1"/>
|
<line x1="40" y1="100" x2="300" y2="100" stroke="rgba(148,163,184,0.3)" stroke-width="1"/>
|
||||||
<line x1="40" y1="20" x2="40" y2="100" stroke="rgba(148,163,184,0.3)" stroke-width="1"/>
|
<line x1="40" y1="20" x2="40" y2="100" stroke="rgba(148,163,184,0.3)" stroke-width="1"/>
|
||||||
<text x="170" y="115" fill="#64748b" font-size="9" text-anchor="middle">Arithmetic Intensity (FLOP/Byte)</text>
|
<text x="170" y="115" fill="#64748b" font-size="9" text-anchor="middle">Arithmetic Intensity (FLOP/Byte)</text>
|
||||||
@@ -147,18 +77,21 @@ Analytical solvers for reasoning about ML workloads, from microcontrollers to th
|
|||||||
<div class="im-slide" data-index="1">
|
<div class="im-slide" data-index="1">
|
||||||
<div class="im-slide-label">Hardware Comparison</div>
|
<div class="im-slide-label">Hardware Comparison</div>
|
||||||
<div class="im-slide-viz">
|
<div class="im-slide-viz">
|
||||||
<svg viewBox="0 0 320 120" class="im-bars-svg">
|
<svg viewBox="0 0 320 130" class="im-bars-svg">
|
||||||
<text x="50" y="22" fill="#94a3b8" font-size="9" text-anchor="end">H100</text>
|
<text x="50" y="22" fill="#94a3b8" font-size="9" text-anchor="end">H100</text>
|
||||||
<rect x="55" y="12" width="0" height="14" rx="3" fill="#38bdf8"><animate attributeName="width" from="0" to="150" dur="1.5s" fill="freeze" begin="0s"/></rect>
|
<rect x="55" y="12" width="0" height="14" rx="3" fill="#38bdf8"><animate attributeName="width" from="0" to="200" dur="1.5s" fill="freeze" begin="0s"/></rect>
|
||||||
<text x="210" y="23" fill="#94a3b8" font-size="8">990 TFLOP/s</text>
|
<text x="260" y="23" fill="#94a3b8" font-size="8">990 TFLOP/s</text>
|
||||||
|
|
||||||
<text x="50" y="47" fill="#94a3b8" font-size="9" text-anchor="end">A100</text>
|
<text x="50" y="47" fill="#94a3b8" font-size="9" text-anchor="end">A100</text>
|
||||||
<rect x="55" y="37" width="0" height="14" rx="3" fill="#38bdf8" opacity="0.7"><animate attributeName="width" from="0" to="95" dur="1.5s" fill="freeze" begin="0.1s"/></rect>
|
<rect x="55" y="37" width="0" height="14" rx="3" fill="#38bdf8" opacity="0.7"><animate attributeName="width" from="0" to="120" dur="1.5s" fill="freeze" begin="0.1s"/></rect>
|
||||||
<text x="155" y="48" fill="#94a3b8" font-size="8">312 TFLOP/s</text>
|
<text x="180" y="48" fill="#94a3b8" font-size="8">312 TFLOP/s</text>
|
||||||
|
|
||||||
<text x="50" y="72" fill="#94a3b8" font-size="9" text-anchor="end">Jetson</text>
|
<text x="50" y="72" fill="#94a3b8" font-size="9" text-anchor="end">Jetson</text>
|
||||||
<rect x="55" y="62" width="0" height="14" rx="3" fill="#38bdf8" opacity="0.4"><animate attributeName="width" from="0" to="8" dur="1.5s" fill="freeze" begin="0.2s"/></rect>
|
<rect x="55" y="62" width="0" height="14" rx="3" fill="#38bdf8" opacity="0.4"><animate attributeName="width" from="0" to="15" dur="1.5s" fill="freeze" begin="0.2s"/></rect>
|
||||||
<text x="68" y="73" fill="#94a3b8" font-size="8">25 TFLOP/s</text>
|
<text x="75" y="73" fill="#94a3b8" font-size="8">25 TFLOP/s</text>
|
||||||
|
|
||||||
<text x="50" y="97" fill="#94a3b8" font-size="9" text-anchor="end">ESP32</text>
|
<text x="50" y="97" fill="#94a3b8" font-size="9" text-anchor="end">ESP32</text>
|
||||||
<rect x="55" y="87" width="0" height="14" rx="3" fill="#38bdf8" opacity="0.2"><animate attributeName="width" from="0" to="1" dur="1.5s" fill="freeze" begin="0.3s"/></rect>
|
<rect x="55" y="87" width="0" height="14" rx="3" fill="#38bdf8" opacity="0.2"><animate attributeName="width" from="0" to="2" dur="1.5s" fill="freeze" begin="0.3s"/></rect>
|
||||||
<text x="62" y="98" fill="#94a3b8" font-size="8">0.5 GFLOP/s</text>
|
<text x="62" y="98" fill="#94a3b8" font-size="8">0.5 GFLOP/s</text>
|
||||||
</svg>
|
</svg>
|
||||||
</div>
|
</div>
|
||||||
@@ -167,19 +100,28 @@ Analytical solvers for reasoning about ML workloads, from microcontrollers to th
|
|||||||
<div class="im-slide" data-index="2">
|
<div class="im-slide" data-index="2">
|
||||||
<div class="im-slide-label">Sustainability Analysis</div>
|
<div class="im-slide-label">Sustainability Analysis</div>
|
||||||
<div class="im-slide-viz">
|
<div class="im-slide-viz">
|
||||||
<svg viewBox="0 0 320 120" class="im-sustain-svg">
|
<svg viewBox="0 0 320 130" class="im-sustain-svg">
|
||||||
|
<line x1="85" y1="10" x2="85" y2="110" stroke="rgba(148,163,184,0.1)" stroke-width="1"/>
|
||||||
<text x="80" y="22" fill="#94a3b8" font-size="9" text-anchor="end">Quebec</text>
|
<text x="80" y="22" fill="#94a3b8" font-size="9" text-anchor="end">Quebec</text>
|
||||||
<rect x="85" y="12" width="0" height="14" rx="3" fill="#10b981"><animate attributeName="width" from="0" to="12" dur="1.5s" fill="freeze"/></rect>
|
<rect x="85" y="12" width="0" height="14" rx="3" fill="#10b981">
|
||||||
<text x="102" y="23" fill="#94a3b8" font-size="8">24 g CO₂/kWh</text>
|
<animate attributeName="width" from="0" to="10" dur="1.5s" fill="freeze" begin="0s"/>
|
||||||
|
</rect>
|
||||||
|
<text x="100" y="23" fill="#94a3b8" font-size="8">20 g CO₂/kWh</text>
|
||||||
<text x="80" y="47" fill="#94a3b8" font-size="9" text-anchor="end">Norway</text>
|
<text x="80" y="47" fill="#94a3b8" font-size="9" text-anchor="end">Norway</text>
|
||||||
<rect x="85" y="37" width="0" height="14" rx="3" fill="#10b981" opacity="0.8"><animate attributeName="width" from="0" to="16" dur="1.5s" fill="freeze" begin="0.1s"/></rect>
|
<rect x="85" y="37" width="0" height="14" rx="3" fill="#10b981" opacity="0.8">
|
||||||
<text x="106" y="48" fill="#94a3b8" font-size="8">29 g CO₂/kWh</text>
|
<animate attributeName="width" from="0" to="5" dur="1.5s" fill="freeze" begin="0.1s"/>
|
||||||
|
</rect>
|
||||||
|
<text x="95" y="48" fill="#94a3b8" font-size="8">10 g CO₂/kWh</text>
|
||||||
<text x="80" y="72" fill="#94a3b8" font-size="9" text-anchor="end">US Avg</text>
|
<text x="80" y="72" fill="#94a3b8" font-size="9" text-anchor="end">US Avg</text>
|
||||||
<rect x="85" y="62" width="0" height="14" rx="3" fill="#f59e0b"><animate attributeName="width" from="0" to="110" dur="1.5s" fill="freeze" begin="0.2s"/></rect>
|
<rect x="85" y="62" width="0" height="14" rx="3" fill="#f59e0b">
|
||||||
<text x="200" y="73" fill="#94a3b8" font-size="8">390 g CO₂/kWh</text>
|
<animate attributeName="width" from="0" to="95" dur="1.5s" fill="freeze" begin="0.2s"/>
|
||||||
|
</rect>
|
||||||
|
<text x="185" y="73" fill="#94a3b8" font-size="8">390 g CO₂/kWh</text>
|
||||||
<text x="80" y="97" fill="#94a3b8" font-size="9" text-anchor="end">Poland</text>
|
<text x="80" y="97" fill="#94a3b8" font-size="9" text-anchor="end">Poland</text>
|
||||||
<rect x="85" y="87" width="0" height="14" rx="3" fill="#ef4444"><animate attributeName="width" from="0" to="200" dur="1.5s" fill="freeze" begin="0.3s"/></rect>
|
<rect x="85" y="87" width="0" height="14" rx="3" fill="#ef4444">
|
||||||
<text x="290" y="98" fill="#94a3b8" font-size="8">700+ g CO₂/kWh</text>
|
<animate attributeName="width" from="0" to="200" dur="1.5s" fill="freeze" begin="0.3s"/>
|
||||||
|
</rect>
|
||||||
|
<text x="290" y="98" fill="#94a3b8" font-size="8">820 g CO₂/kWh</text>
|
||||||
</svg>
|
</svg>
|
||||||
</div>
|
</div>
|
||||||
<div class="im-slide-caption">Same workload, different region. Up to 41x difference in carbon footprint.</div>
|
<div class="im-slide-caption">Same workload, different region. Up to 82x difference in carbon footprint.</div>
|
||||||
@@ -187,37 +129,87 @@ Analytical solvers for reasoning about ML workloads, from microcontrollers to th
|
|||||||
<div class="im-slide" data-index="3">
|
<div class="im-slide" data-index="3">
|
||||||
<div class="im-slide-label">LLM Serving</div>
|
<div class="im-slide-label">LLM Serving</div>
|
||||||
<div class="im-slide-viz">
|
<div class="im-slide-viz">
|
||||||
<svg viewBox="0 0 320 120" class="im-serving-svg">
|
<svg viewBox="0 0 320 130" class="im-serving-svg">
|
||||||
<text x="160" y="15" fill="#94a3b8" font-size="9" text-anchor="middle">Llama-3.1-8B on H100</text>
|
<text x="160" y="15" fill="#94a3b8" font-size="9" text-anchor="middle">Llama-3.1-8B on H100</text>
|
||||||
<rect x="30" y="30" width="120" height="50" rx="6" fill="rgba(56,189,248,0.1)" stroke="rgba(56,189,248,0.3)" stroke-width="1"/>
|
<rect x="30" y="30" width="120" height="50" rx="6" fill="rgba(56,189,248,0.1)" stroke="rgba(56,189,248,0.3)" stroke-width="1"/>
|
||||||
<text x="90" y="48" fill="#38bdf8" font-size="9" font-weight="bold" text-anchor="middle">Pre-fill</text>
|
<text x="90" y="46" fill="#38bdf8" font-size="9" font-weight="bold" text-anchor="middle">Pre-fill</text>
|
||||||
<text x="90" y="62" fill="#7dd3fc" font-size="18" font-weight="bold" text-anchor="middle">4.2 ms</text>
|
<text x="90" y="64" fill="#7dd3fc" font-size="18" font-weight="bold" text-anchor="middle">4.2 ms</text>
|
||||||
<text x="90" y="74" fill="#64748b" font-size="7" text-anchor="middle">TTFT (compute-bound)</text>
|
<text x="90" y="76" fill="#64748b" font-size="7" text-anchor="middle">TTFT (compute-bound)</text>
|
||||||
<text x="160" y="58" fill="#94a3b8" font-size="14">→</text>
|
<text x="160" y="60" fill="#94a3b8" font-size="14">→</text>
|
||||||
<rect x="180" y="30" width="120" height="50" rx="6" fill="rgba(16,185,129,0.1)" stroke="rgba(16,185,129,0.3)" stroke-width="1"/>
|
<rect x="180" y="30" width="120" height="50" rx="6" fill="rgba(16,185,129,0.1)" stroke="rgba(16,185,129,0.3)" stroke-width="1"/>
|
||||||
<text x="240" y="48" fill="#10b981" font-size="9" font-weight="bold" text-anchor="middle">Decode</text>
|
<text x="240" y="46" fill="#10b981" font-size="9" font-weight="bold" text-anchor="middle">Decode</text>
|
||||||
<text x="240" y="62" fill="#6ee7b7" font-size="18" font-weight="bold" text-anchor="middle">0.8 ms</text>
|
<text x="240" y="64" fill="#6ee7b7" font-size="18" font-weight="bold" text-anchor="middle">0.8 ms</text>
|
||||||
<text x="240" y="74" fill="#64748b" font-size="7" text-anchor="middle">ITL (memory-bound)</text>
|
<text x="240" y="76" fill="#64748b" font-size="7" text-anchor="middle">ITL (memory-bound)</text>
|
||||||
<rect x="70" y="90" width="180" height="22" rx="4" fill="rgba(245,158,11,0.1)" stroke="rgba(245,158,11,0.3)" stroke-width="1"/>
|
<rect x="70" y="96" width="180" height="22" rx="4" fill="rgba(245,158,11,0.1)" stroke="rgba(245,158,11,0.3)" stroke-width="1"/>
|
||||||
<text x="160" y="105" fill="#f59e0b" font-size="8" text-anchor="middle">KV-Cache: 2.1 GB / 80 GB available</text>
|
<text x="160" y="111" fill="#f59e0b" font-size="8" text-anchor="middle">KV-Cache: 2.1 GB / 80 GB available</text>
|
||||||
</svg>
|
</svg>
|
||||||
</div>
|
</div>
|
||||||
<div class="im-slide-caption">Model the two phases of autoregressive inference and KV-cache memory pressure.</div>
|
<div class="im-slide-caption">Model the two phases of autoregressive inference and KV-cache memory pressure.</div>
|
||||||
</div>
|
</div>
|
||||||
|
<div class="im-slide" data-index="4">
|
||||||
|
<div class="im-slide-label">Distributed Training</div>
|
||||||
|
<div class="im-slide-viz">
|
||||||
|
<svg viewBox="0 0 320 130" class="im-distributed-svg">
|
||||||
|
<text x="160" y="15" fill="#94a3b8" font-size="9" text-anchor="middle">256× H100 — GPT-3 175B</text>
|
||||||
|
<!-- Parallelism strategy boxes -->
|
||||||
|
<rect x="10" y="28" width="95" height="42" rx="6" fill="rgba(124,58,237,0.1)" stroke="rgba(124,58,237,0.3)" stroke-width="1"/>
|
||||||
|
<text x="57.5" y="45" fill="#a78bfa" font-size="8" font-weight="bold" text-anchor="middle">Data Parallel</text>
|
||||||
|
<text x="57.5" y="62" fill="#c4b5fd" font-size="16" font-weight="bold" text-anchor="middle">32×</text>
|
||||||
|
|
||||||
|
<rect x="112.5" y="28" width="95" height="42" rx="6" fill="rgba(56,189,248,0.1)" stroke="rgba(56,189,248,0.3)" stroke-width="1"/>
|
||||||
|
<text x="160" y="45" fill="#38bdf8" font-size="8" font-weight="bold" text-anchor="middle">Tensor Parallel</text>
|
||||||
|
<text x="160" y="62" fill="#7dd3fc" font-size="16" font-weight="bold" text-anchor="middle">4×</text>
|
||||||
|
|
||||||
|
<rect x="215" y="28" width="95" height="42" rx="6" fill="rgba(16,185,129,0.1)" stroke="rgba(16,185,129,0.3)" stroke-width="1"/>
|
||||||
|
<text x="262.5" y="45" fill="#10b981" font-size="8" font-weight="bold" text-anchor="middle">Pipeline Parallel</text>
|
||||||
|
<text x="262.5" y="62" fill="#6ee7b7" font-size="16" font-weight="bold" text-anchor="middle">2×</text>
|
||||||
|
|
||||||
|
<!-- Results row -->
|
||||||
|
<line x1="20" y1="82" x2="300" y2="82" stroke="rgba(148,163,184,0.15)" stroke-width="1"/>
|
||||||
|
<text x="85" y="98" fill="#94a3b8" font-size="8" text-anchor="middle">Scaling Efficiency</text>
|
||||||
|
<text x="85" y="118" fill="#a78bfa" font-size="18" font-weight="bold" text-anchor="middle">74%</text>
|
||||||
|
<text x="235" y="98" fill="#94a3b8" font-size="8" text-anchor="middle">Pipeline Bubble</text>
|
||||||
|
<text x="235" y="114" fill="#f59e0b" font-size="18" font-weight="bold" text-anchor="middle">6.3%</text>
|
||||||
|
</svg>
|
||||||
|
</div>
|
||||||
|
<div class="im-slide-caption">3D parallelism decomposition: data, tensor, and pipeline parallel scaling on GPU clusters.</div>
|
||||||
|
</div>
|
||||||
|
<div class="im-slide" data-index="5">
|
||||||
|
<div class="im-slide-label">Total Cost of Ownership</div>
|
||||||
|
<div class="im-slide-viz">
|
||||||
|
<svg viewBox="0 0 320 130" class="im-tco-svg">
|
||||||
|
<text x="160" y="15" fill="#94a3b8" font-size="9" text-anchor="middle">64× H100 Cluster — 3-Year TCO</text>
|
||||||
|
<!-- Stacked cost bars -->
|
||||||
|
<text x="50" y="42" fill="#94a3b8" font-size="9" text-anchor="end">CapEx</text>
|
||||||
|
<rect x="55" y="30" width="0" height="16" rx="3" fill="#38bdf8"><animate attributeName="width" from="0" to="200" dur="1.5s" fill="freeze"/></rect>
|
||||||
|
<text x="260" y="42" fill="#94a3b8" font-size="8">$2.0M</text>
|
||||||
|
|
||||||
|
<text x="50" y="68" fill="#94a3b8" font-size="9" text-anchor="end">Energy</text>
|
||||||
|
<rect x="55" y="56" width="0" height="16" rx="3" fill="#f59e0b"><animate attributeName="width" from="0" to="120" dur="1.5s" fill="freeze" begin="0.1s"/></rect>
|
||||||
|
<text x="180" y="68" fill="#94a3b8" font-size="8">$1.2M</text>
|
||||||
|
|
||||||
|
<text x="50" y="94" fill="#94a3b8" font-size="9" text-anchor="end">Maint.</text>
|
||||||
|
<rect x="55" y="82" width="0" height="16" rx="3" fill="#10b981"><animate attributeName="width" from="0" to="50" dur="1.5s" fill="freeze" begin="0.2s"/></rect>
|
||||||
|
<text x="110" y="94" fill="#94a3b8" font-size="8">$0.5M</text>
|
||||||
|
|
||||||
|
<!-- Total -->
|
||||||
|
<line x1="55" y1="108" x2="260" y2="108" stroke="rgba(148,163,184,0.2)" stroke-width="1"/>
|
||||||
|
<text x="55" y="124" fill="#94a3b8" font-size="9">Total TCO</text>
|
||||||
|
<text x="260" y="124" fill="#e2e8f0" font-size="14" font-weight="bold" text-anchor="end">$3.7M</text>
|
||||||
|
</svg>
|
||||||
|
</div>
|
||||||
|
<div class="im-slide-caption">Break down hardware, energy, and maintenance costs over any time horizon.</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="im-carousel-dots">
|
<div class="im-carousel-dots">
|
||||||
<button class="im-dot im-dot-active" data-slide="0" aria-label="Roofline Analysis"></button>
|
<button class="im-dot im-dot-active" data-slide="0" aria-label="Roofline Analysis"></button>
|
||||||
<button class="im-dot" data-slide="1" aria-label="Hardware Comparison"></button>
|
<button class="im-dot" data-slide="1" aria-label="Hardware Comparison"></button>
|
||||||
<button class="im-dot" data-slide="2" aria-label="Sustainability"></button>
|
<button class="im-dot" data-slide="2" aria-label="Sustainability"></button>
|
||||||
<button class="im-dot" data-slide="3" aria-label="LLM Serving"></button>
|
<button class="im-dot" data-slide="3" aria-label="LLM Serving"></button>
|
||||||
|
<button class="im-dot" data-slide="4" aria-label="Distributed Training"></button>
|
||||||
|
<button class="im-dot" data-slide="5" aria-label="Total Cost of Ownership"></button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="im-stats">
|
|
||||||
<div class="im-stat"><span class="im-stat-num">6</span><span class="im-stat-label">Analytical Solvers</span></div>
|
|
||||||
<div class="im-stat"><span class="im-stat-num">18+</span><span class="im-stat-label">Hardware Devices</span></div>
|
|
||||||
<div class="im-stat"><span class="im-stat-num">13+</span><span class="im-stat-label">ML Workloads</span></div>
|
|
||||||
<div class="im-stat"><span class="im-stat-num">4</span><span class="im-stat-label">Grid Regions</span></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
```
|
```
|
||||||
@@ -252,7 +244,7 @@ print(f"Latency: {profile.latency.to('ms'):~.2f}") # → 0.34 ms
|
|||||||
print(f"Throughput: {profile.throughput:.0f} img/s") # → 2941 img/s
|
print(f"Throughput: {profile.throughput:.0f} img/s") # → 2941 img/s
|
||||||
```
|
```
|
||||||
|
|
||||||
At batch=1, ResNet-50 loads ~50 MB of weights but performs only ~8 GFLOPs, making it firmly memory-bound on any modern GPU. The solver identifies this in microseconds using the **Iron Law**:
|
At batch=1, ResNet-50 loads ~50 MB of weights but performs only ~8 GFLOPs, making it firmly memory-bound on any modern GPU. The solver identifies this in microseconds using the **Iron Law** [@williams2009roofline]:
|
||||||
|
|
||||||
$$T = \max\!\left(\frac{\text{FLOPs}}{\text{Peak} \times \eta},\ \frac{\text{Bytes}}{\text{BW}}\right)$$
|
$$T = \max\!\left(\frac{\text{FLOPs}}{\text{Peak} \times \eta},\ \frac{\text{Bytes}}{\text{BW}}\right)$$
|
||||||
|
|
||||||
@@ -272,42 +264,42 @@ Every solver takes typed registry objects and returns analytically grounded esti
|
|||||||
::: {.im-solver-card .im-solver-roofline}
|
::: {.im-solver-card .im-solver-roofline}
|
||||||
::: {.im-solver-icon}
|
::: {.im-solver-icon}
|
||||||
:::
|
:::
|
||||||
**Roofline Analysis**\
|
**Roofline Analysis**
|
||||||
Compute vs. memory bottleneck identification using the Iron Law. Single-node latency and throughput.
|
Compute vs. memory bottleneck identification using the Iron Law. Single-node latency and throughput.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-solver-card .im-solver-distributed}
|
::: {.im-solver-card .im-solver-distributed}
|
||||||
::: {.im-solver-icon}
|
::: {.im-solver-icon}
|
||||||
:::
|
:::
|
||||||
**3D Parallelism**\
|
**3D Parallelism**
|
||||||
Data, tensor, and pipeline parallel scaling efficiency. Ring all-reduce and pipeline bubble overhead.
|
Data, tensor, and pipeline parallel scaling efficiency. Ring all-reduce and pipeline bubble overhead.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-solver-card .im-solver-serving}
|
::: {.im-solver-card .im-solver-serving}
|
||||||
::: {.im-solver-icon}
|
::: {.im-solver-icon}
|
||||||
:::
|
:::
|
||||||
**LLM Serving**\
|
**LLM Serving**
|
||||||
Time-to-first-token (TTFT), inter-token latency (ITL), and KV-cache memory pressure.
|
Time-to-first-token (TTFT), inter-token latency (ITL), and KV-cache memory pressure.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-solver-card .im-solver-tco}
|
::: {.im-solver-card .im-solver-tco}
|
||||||
::: {.im-solver-icon}
|
::: {.im-solver-icon}
|
||||||
:::
|
:::
|
||||||
**Total Cost of Ownership**\
|
**Total Cost of Ownership**
|
||||||
CapEx, OpEx, electricity, maintenance, and per-query economics over any time horizon.
|
CapEx, OpEx, electricity, maintenance, and per-query economics over any time horizon.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-solver-card .im-solver-sustain}
|
::: {.im-solver-card .im-solver-sustain}
|
||||||
::: {.im-solver-icon}
|
::: {.im-solver-icon}
|
||||||
:::
|
:::
|
||||||
**Sustainability**\
|
**Sustainability**
|
||||||
Energy, carbon footprint (kg CO₂e), and water usage across datacenter regions.
|
Energy, carbon footprint (kg CO₂e), and water usage across datacenter regions.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-solver-card .im-solver-reliability}
|
::: {.im-solver-card .im-solver-reliability}
|
||||||
::: {.im-solver-icon}
|
::: {.im-solver-icon}
|
||||||
:::
|
:::
|
||||||
**Reliability**\
|
**Reliability**
|
||||||
Fleet MTBF, failure probability, and Young-Daly optimal checkpoint interval.
|
Fleet MTBF, failure probability, and Young-Daly optimal checkpoint interval.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
@@ -368,21 +360,21 @@ Same model, same GPU, yet up to 41x difference in carbon footprint depending on
|
|||||||
::: {.im-audience}
|
::: {.im-audience}
|
||||||
|
|
||||||
::: {.im-audience-item .im-aud-student}
|
::: {.im-audience-item .im-aud-student}
|
||||||
**Students**
|
[**Students**](for-students.qmd)
|
||||||
|
|
||||||
Build intuition for *why* ML systems behave as they do. Run roofline analysis, see the memory wall, compute carbon footprints, all without needing GPU hardware. Pairs chapter-by-chapter with the textbook.
|
Build intuition for *why* ML systems behave as they do. Run roofline analysis, see the memory wall, compute carbon footprints — all without needing GPU hardware. [See learning path →](for-students.qmd)
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-audience-item .im-aud-instructor}
|
::: {.im-audience-item .im-aud-instructor}
|
||||||
**Instructors**
|
[**Instructors**](for-instructors.qmd)
|
||||||
|
|
||||||
Assign analytically grounded problem sets with deterministic, reproducible outputs. All specs sourced from vetted datasheets. Works in Jupyter and Quarto notebooks.
|
Assign analytically grounded problem sets with deterministic, reproducible outputs. All specs sourced from vetted datasheets. [See course integration →](for-instructors.qmd)
|
||||||
:::
|
:::
|
||||||
|
|
||||||
::: {.im-audience-item .im-aud-engineer}
|
::: {.im-audience-item .im-aud-engineer}
|
||||||
**Engineers & Researchers**
|
[**Engineers & Researchers**](for-engineers.qmd)
|
||||||
|
|
||||||
Pre-deployment estimates for any architecture. Model distributed overheads, LLM serving latency, and multi-region sustainability before provisioning hardware.
|
Pre-deployment estimates for any architecture. Model distributed overheads, LLM serving latency, and multi-region sustainability before provisioning hardware. [See quick API guide →](for-engineers.qmd)
|
||||||
:::
|
:::
|
||||||
|
|
||||||
:::
|
:::
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Mathematical Foundations"
|
title: "Mathematical Foundations"
|
||||||
subtitle: "The First-Principles Equations Behind Every MLSYSIM Solver"
|
subtitle: "The First-Principles Equations Behind Every MLSYSIM Solver"
|
||||||
---
|
---
|
||||||
|
|
||||||
MLSYSIM avoids "black box" heuristics. Every output traces back to one of the equations below.
|
MLSYSIM avoids "black box" heuristics. Every output traces back to one of the equations below.
|
||||||
Before diving into code, read this page to understand *what* the solvers are computing and *why*.
|
Before diving into code, read this page to understand *what* the solvers are computing and *why*.
|
||||||
|
|
||||||
@@ -18,9 +17,12 @@ Click any solver name to go directly to its API documentation.
|
|||||||
|
|
||||||
*Implemented in [`mlsysim.core.solver.SingleNodeSolver`](api/core.solver.SingleNodeSolver.qmd).*
|
*Implemented in [`mlsysim.core.solver.SingleNodeSolver`](api/core.solver.SingleNodeSolver.qmd).*
|
||||||
|
|
||||||
**The physical intuition**: Hardware has two speed limits—how fast it can compute, and how fast it can
|
::: {.callout-note appearance="simple" icon=false}
|
||||||
move data from memory to the compute units. Your actual throughput is determined by whichever limit
|
**💡 Intuition: The Roofline Bottleneck**
|
||||||
you hit first. This is why we take the *maximum* of two terms, not their sum.
|
Hardware has two speed limits—how fast it can compute, and how fast it can move data from memory to the compute units. Your actual throughput is determined by whichever limit you hit first. This is why we take the *maximum* of two terms, not their sum.
|
||||||
|
|
||||||
|
**📚 Source:** @williams2009roofline
|
||||||
|
:::
|
||||||
|
|
||||||
$$
|
$$
|
||||||
T = \max \left( \frac{\text{FLOPs}}{\text{Peak\_FLOPs} \times \eta},\ \frac{\text{Bytes}}{\text{Memory\_BW}} \right) + \text{Dispatch\_Tax}
|
T = \max \left( \frac{\text{FLOPs}}{\text{Peak\_FLOPs} \times \eta},\ \frac{\text{Bytes}}{\text{Memory\_BW}} \right) + \text{Dispatch\_Tax}
|
||||||
@@ -96,25 +98,45 @@ at low batch sizes. Upgrading from 100 Gb Ethernet to InfiniBand NDR (400 Gb/s)
|
|||||||
|
|
||||||
### 2.3 Pipeline Parallelism Bubble
|
### 2.3 Pipeline Parallelism Bubble
|
||||||
|
|
||||||
**Pipeline parallelism** splits a model's layers across multiple stages (nodes). Stage 1
|
**Pipeline parallelism** splits a model's layers across multiple stages (nodes). Stage 1 processes layers 1–20, stage 2 processes layers 21–40, and so on. This allows models too large for a single GPU to be trained across multiple nodes.
|
||||||
processes layers 1–20, stage 2 processes layers 21–40, and so on. This allows models too large
|
|
||||||
for a single GPU to be trained across multiple nodes.
|
|
||||||
|
|
||||||
The cost is a **pipeline bubble**: at the start of each batch, downstream stages sit idle
|
::: {.callout-note appearance="simple" icon=false}
|
||||||
while waiting for upstream stages to produce output. When a pipeline of depth $P$ processes
|
**💡 Intuition: Shrinking the Pipeline Bubble**
|
||||||
$M$ microbatches, the fraction of time spent idle is:
|
In standard 1F1B pipeline parallelism, GPUs sit idle waiting for microbatches to traverse the network. You can't change the speed of light, but you *can* change the software schedule. By assigning multiple "virtual stages" ($V$) to a single GPU, we interleave the execution. While a GPU is waiting for the next microbatch of its *first* virtual stage, it can compute a microbatch for its *second* virtual stage, effectively hiding the network latency behind useful compute.
|
||||||
|
|
||||||
|
**📚 Source:** @narayanan2021efficient
|
||||||
|
:::
|
||||||
|
|
||||||
|
The cost of pipelining is a **pipeline bubble**: at the start of each batch, downstream stages sit idle while waiting for upstream stages to produce output. When a pipeline of depth $P$ processes $M$ microbatches with $V$ virtual stages per GPU, the fraction of time spent idle is:
|
||||||
|
|
||||||
$$
|
$$
|
||||||
\text{Bubble Fraction} = \frac{P - 1}{P - 1 + M}
|
\text{Bubble Fraction} = \frac{P - 1}{V \times M + P - 1}
|
||||||
$$
|
$$
|
||||||
|
|
||||||
The intuition: with $P$ stages and $M$ microbatches, the pipeline takes $P - 1 + M$ time
|
The intuition: with $P$ stages, the pipeline spends $P - 1$ steps filling and draining, during which not all stages are active. The solution is to either increase $M$ (more microbatches) or increase $V$ (interleaved schedules). Both make the startup and drain phases a smaller fraction of total time.
|
||||||
steps to complete, but only $M$ of those steps have all stages active. The solution is to
|
|
||||||
increase $M$ — more microbatches mean the startup and drain phases become a smaller fraction
|
|
||||||
of total time.
|
|
||||||
|
|
||||||
**Implication**: To keep the bubble below 5%, you need $M \geq 19 \cdot (P-1)$ microbatches.
|
**Implication**: To keep the bubble below 5% using standard 1F1B ($V=1$), you need $M \geq 19 \cdot (P-1)$ microbatches. With a 4-stage pipeline ($P=4$), you need at least 57 microbatches. By using $V=2$ virtual stages, you cut the required microbatches in half.
|
||||||
With a 4-stage pipeline (P=4), you need at least 57 microbatches to achieve 95% efficiency.
|
|
||||||
|
### 2.4 Expert Parallelism (Mixture of Experts)
|
||||||
|
|
||||||
|
::: {.callout-note appearance="simple" icon=false}
|
||||||
|
**💡 Intuition: Breaking the Iron Law**
|
||||||
|
Standard dense Transformers obey a strict "Iron Law": if you double the parameters, you double the memory *and* the compute FLOPs. Mixture of Experts (MoE) breaks this law. It routes tokens only to specific "expert" subnetworks. This means your **Memory Bound** is dictated by the massive *Total Parameters*, but your **Compute Bound** is dictated only by the much smaller *Active Parameters*. The physical tradeoff is a massive network bandwidth tax (All-to-All communication) to route tokens to the right experts across the cluster.
|
||||||
|
|
||||||
|
**📚 Source:** @shazeer2017outrageously
|
||||||
|
:::
|
||||||
|
|
||||||
|
To model MoE, we move from 3D to **4D Parallelism**:
|
||||||
|
|
||||||
|
$$
|
||||||
|
\text{Data Parallelism} = \frac{\text{Total GPUs}}{TP \times PP \times EP}
|
||||||
|
$$
|
||||||
|
|
||||||
|
Where $EP$ is Expert Parallelism. If $EP > 1$, the solver adds an All-to-All communication penalty for token routing:
|
||||||
|
|
||||||
|
$$
|
||||||
|
T_{\text{all-to-all}} = \frac{N-1}{N} \times \frac{\text{Message Size}}{\text{Bandwidth}} + (N-1) \times \text{Latency}
|
||||||
|
$$
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -198,6 +220,27 @@ Where:
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## 6. Cluster Reliability (The Young-Daly Model)
|
||||||
|
|
||||||
|
*Implemented in [`mlsysim.core.solver.ReliabilitySolver`](api/core.solver.ReliabilitySolver.qmd).*
|
||||||
|
|
||||||
|
::: {.callout-note appearance="simple" icon=false}
|
||||||
|
**💡 Intuition: The Cost of Checkpointing**
|
||||||
|
When training massive models on thousands of GPUs for months, hardware failures are not a possibility; they are a statistical certainty. If a node fails, the job crashes and you lose all progress since the last checkpoint. You want to save checkpoints frequently to minimize lost work, but writing a 140GB checkpoint to remote storage takes time, pausing the training. The Young-Daly model calculates the optimal balance between *time wasted saving checkpoints* and *time wasted re-computing after a failure*.
|
||||||
|
|
||||||
|
**📚 Source:** @young1974first and @daly2006higher
|
||||||
|
:::
|
||||||
|
|
||||||
|
The optimal checkpoint interval $\tau_{\text{opt}}$ is defined by the Mean Time Between Failures ($M$) and the time it takes to write a single checkpoint ($\delta$):
|
||||||
|
|
||||||
|
$$
|
||||||
|
\tau_{\text{opt}} = \sqrt{2 \times \delta \times M}
|
||||||
|
$$
|
||||||
|
|
||||||
|
For a cluster, the collective $M$ is inversely proportional to the number of components. If a single node has an MTBF of 10,000 hours, a cluster of 1,000 nodes will have an MTBF of just 10 hours ($10{,}000 / 1000$).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
::: {.callout-note}
|
::: {.callout-note}
|
||||||
## Limitations of First-Order Models
|
## Limitations of First-Order Models
|
||||||
These equations are first-order analytical models. They assume:
|
These equations are first-order analytical models. They assume:
|
||||||
|
|||||||
@@ -166,3 +166,17 @@
|
|||||||
year = {2019},
|
year = {2019},
|
||||||
doi = {10.1109/ICCAD45719.2019.8942149}
|
doi = {10.1109/ICCAD45719.2019.8942149}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@inproceedings{narayanan2021efficient,
|
||||||
|
  title     = {Efficient Large-Scale Language Model Training on GPU Clusters Using Megatron-LM},
|
||||||
|
  author    = {Narayanan, Deepak and Shoeybi, Mohammad and Casper, Jared and others},
|
||||||
|
  booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC)},
|
||||||
|
year = {2021}
|
||||||
|
}
|
||||||
|
|
||||||
|
@article{shazeer2017outrageously,
|
||||||
|
title = {Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer},
|
||||||
|
author = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and others},
|
||||||
|
journal = {arXiv preprint arXiv:1701.06538},
|
||||||
|
year = {2017}
|
||||||
|
}
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Which Solver Do I Need?"
|
title: "Which Solver Do I Need?"
|
||||||
subtitle: "A decision guide for choosing the right MLSYSIM analytical tool."
|
subtitle: "A decision guide for choosing the right MLSYSIM analytical tool."
|
||||||
---
|
---
|
||||||
|
|
||||||
MLSYSIM provides six specialized solvers, each designed to answer a different class of question about ML systems. This page helps you pick the right one.
|
MLSYSIM provides six specialized solvers, each designed to answer a different class of question about ML systems. This page helps you pick the right one.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|||||||
@@ -28,14 +28,14 @@
|
|||||||
.im-hero {
|
.im-hero {
|
||||||
background: linear-gradient(165deg, #0f172a 0%, #1e293b 100%);
|
background: linear-gradient(165deg, #0f172a 0%, #1e293b 100%);
|
||||||
color: white;
|
color: white;
|
||||||
padding: 4.5rem 2rem 3rem;
|
padding: 6.5rem 2rem 4rem;
|
||||||
position: relative;
|
position: relative;
|
||||||
overflow: hidden;
|
overflow: hidden;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Carousel + stats portion: no extra padding on top, smooth continuation */
|
/* Carousel + stats portion: no extra padding on top, smooth continuation */
|
||||||
.im-hero.im-hero-showcase {
|
.im-hero.im-hero-showcase {
|
||||||
padding: 0 2rem 3rem;
|
padding: 2rem 2rem 2rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Subtle animated grid overlay */
|
/* Subtle animated grid overlay */
|
||||||
@@ -134,7 +134,7 @@
|
|||||||
font-size: 0.9rem;
|
font-size: 0.9rem;
|
||||||
color: #94a3b8;
|
color: #94a3b8;
|
||||||
line-height: 1.7;
|
line-height: 1.7;
|
||||||
margin-bottom: 0;
|
margin-bottom: 2rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ---------- INSTALL ROW ---------- */
|
/* ---------- INSTALL ROW ---------- */
|
||||||
@@ -185,7 +185,7 @@ code.im-cmd {
|
|||||||
justify-content: center;
|
justify-content: center;
|
||||||
gap: 0.75rem;
|
gap: 0.75rem;
|
||||||
flex-wrap: wrap;
|
flex-wrap: wrap;
|
||||||
margin-bottom: 0;
|
margin-bottom: 2rem;
|
||||||
animation: fade-up 0.6s ease both;
|
animation: fade-up 0.6s ease both;
|
||||||
animation-delay: 0.8s;
|
animation-delay: 0.8s;
|
||||||
}
|
}
|
||||||
@@ -227,14 +227,45 @@ code.im-cmd {
|
|||||||
/* ---------- CAPABILITY CAROUSEL ---------- */
|
/* ---------- CAPABILITY CAROUSEL ---------- */
|
||||||
.im-carousel {
|
.im-carousel {
|
||||||
max-width: 480px;
|
max-width: 480px;
|
||||||
margin: 0 auto 2.5rem;
|
margin: 2.5rem auto 1rem;
|
||||||
animation: fade-up 0.6s ease both;
|
animation: fade-up 0.6s ease both;
|
||||||
animation-delay: 0.9s;
|
animation-delay: 0.9s;
|
||||||
|
position: relative;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.im-arrow {
|
||||||
|
position: absolute;
|
||||||
|
top: 50%;
|
||||||
|
transform: translateY(-50%);
|
||||||
|
z-index: 10;
|
||||||
|
background: rgba(255,255,255,0.06);
|
||||||
|
border: 1px solid rgba(255,255,255,0.15);
|
||||||
|
color: #94a3b8;
|
||||||
|
width: 36px;
|
||||||
|
height: 36px;
|
||||||
|
border-radius: 50%;
|
||||||
|
font-size: 1.3rem;
|
||||||
|
line-height: 1;
|
||||||
|
cursor: pointer;
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
transition: all 150ms ease;
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-arrow:hover {
|
||||||
|
background: rgba(255,255,255,0.12);
|
||||||
|
border-color: rgba(255,255,255,0.3);
|
||||||
|
color: white;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-arrow-prev { left: -52px; }
|
||||||
|
.im-arrow-next { right: -52px; }
|
||||||
|
|
||||||
.im-carousel-track {
|
.im-carousel-track {
|
||||||
position: relative;
|
position: relative;
|
||||||
min-height: 195px;
|
min-height: 280px;
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-slide {
|
.im-slide {
|
||||||
@@ -242,10 +273,14 @@ code.im-cmd {
|
|||||||
top: 0;
|
top: 0;
|
||||||
left: 0;
|
left: 0;
|
||||||
right: 0;
|
right: 0;
|
||||||
|
bottom: 0;
|
||||||
opacity: 0;
|
opacity: 0;
|
||||||
transform: translateY(8px);
|
transform: translateY(8px);
|
||||||
transition: opacity 0.5s ease, transform 0.5s ease;
|
transition: opacity 0.5s ease, transform 0.5s ease;
|
||||||
pointer-events: none;
|
pointer-events: none;
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
justify-content: center;
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-slide-active {
|
.im-slide-active {
|
||||||
@@ -255,7 +290,7 @@ code.im-cmd {
|
|||||||
}
|
}
|
||||||
|
|
||||||
.im-slide-label {
|
.im-slide-label {
|
||||||
font-size: 0.7rem;
|
font-size: 0.75rem;
|
||||||
font-weight: 600;
|
font-weight: 600;
|
||||||
text-transform: uppercase;
|
text-transform: uppercase;
|
||||||
letter-spacing: 0.1em;
|
letter-spacing: 0.1em;
|
||||||
@@ -266,9 +301,10 @@ code.im-cmd {
|
|||||||
|
|
||||||
.im-slide-viz {
|
.im-slide-viz {
|
||||||
background: rgba(255,255,255,0.04);
|
background: rgba(255,255,255,0.04);
|
||||||
|
backdrop-filter: blur(8px);
|
||||||
border: 1px solid rgba(255,255,255,0.08);
|
border: 1px solid rgba(255,255,255,0.08);
|
||||||
border-radius: 10px;
|
border-radius: 10px;
|
||||||
padding: 0.75rem;
|
padding: 1.25rem;
|
||||||
margin-bottom: 0.6rem;
|
margin-bottom: 0.6rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -279,6 +315,7 @@ code.im-cmd {
|
|||||||
}
|
}
|
||||||
|
|
||||||
.im-slide-caption {
|
.im-slide-caption {
|
||||||
|
padding: 0 1rem;
|
||||||
font-size: 0.78rem;
|
font-size: 0.78rem;
|
||||||
color: #94a3b8;
|
color: #94a3b8;
|
||||||
text-align: center;
|
text-align: center;
|
||||||
@@ -289,7 +326,8 @@ code.im-cmd {
|
|||||||
display: flex;
|
display: flex;
|
||||||
justify-content: center;
|
justify-content: center;
|
||||||
gap: 0.5rem;
|
gap: 0.5rem;
|
||||||
margin-top: 1rem;
|
margin-top: 1.5rem;
|
||||||
|
margin-bottom: 0.75rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-dot {
|
.im-dot {
|
||||||
@@ -318,35 +356,41 @@ code.im-cmd {
|
|||||||
.im-stats {
|
.im-stats {
|
||||||
display: flex;
|
display: flex;
|
||||||
justify-content: center;
|
justify-content: center;
|
||||||
gap: 2.5rem;
|
gap: 2rem;
|
||||||
flex-wrap: wrap;
|
flex-wrap: wrap;
|
||||||
padding-top: 2rem;
|
margin: 2rem auto 0.5rem;
|
||||||
border-top: 1px solid rgba(255,255,255,0.08);
|
padding: 0;
|
||||||
animation: fade-up 0.6s ease both;
|
animation: fade-up 0.6s ease both;
|
||||||
animation-delay: 0.95s;
|
animation-delay: 0.6s;
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-stat {
|
.im-stat {
|
||||||
text-align: center;
|
text-align: center;
|
||||||
|
padding: 0.5rem 1rem;
|
||||||
|
background: rgba(255, 255, 255, 0.03);
|
||||||
|
border-radius: 8px;
|
||||||
|
border: 1px solid rgba(255, 255, 255, 0.05);
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-stat-num {
|
.im-stat-num {
|
||||||
|
/* Adjusted for text-based stats */
|
||||||
display: block;
|
display: block;
|
||||||
font-size: 1.75rem;
|
font-size: 1.5rem;
|
||||||
font-weight: 800;
|
font-weight: 800;
|
||||||
color: #38bdf8;
|
color: #38bdf8;
|
||||||
letter-spacing: -0.02em;
|
letter-spacing: -0.02em;
|
||||||
line-height: 1.1;
|
line-height: 1.1;
|
||||||
|
text-shadow: 0 0 20px rgba(56, 189, 248, 0.2);
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-stat-label {
|
.im-stat-label {
|
||||||
display: block;
|
display: block;
|
||||||
font-size: 0.72rem;
|
font-size: 0.65rem;
|
||||||
font-weight: 500;
|
font-weight: 600;
|
||||||
color: #64748b;
|
color: #94a3b8;
|
||||||
text-transform: uppercase;
|
text-transform: uppercase;
|
||||||
letter-spacing: 0.08em;
|
letter-spacing: 0.05em;
|
||||||
margin-top: 0.3rem;
|
margin-top: 0.2rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ---------- CONTENT CONTAINER ---------- */
|
/* ---------- CONTENT CONTAINER ---------- */
|
||||||
@@ -393,7 +437,7 @@ code.im-cmd {
|
|||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
||||||
gap: 0.875rem;
|
gap: 0.875rem;
|
||||||
margin-top: 1rem;
|
margin-top: 1.5rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-solver-card {
|
.im-solver-card {
|
||||||
@@ -440,7 +484,7 @@ code.im-cmd {
|
|||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: repeat(auto-fill, minmax(240px, 1fr));
|
grid-template-columns: repeat(auto-fill, minmax(240px, 1fr));
|
||||||
gap: 1rem;
|
gap: 1rem;
|
||||||
margin-top: 1rem;
|
margin-top: 1.5rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-tutorial-card {
|
.im-tutorial-card {
|
||||||
@@ -502,7 +546,7 @@ code.im-cmd {
|
|||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
|
grid-template-columns: repeat(auto-fit, minmax(240px, 1fr));
|
||||||
gap: 1.5rem;
|
gap: 1.5rem;
|
||||||
margin-top: 1rem;
|
margin-top: 1.5rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
.im-audience-item {
|
.im-audience-item {
|
||||||
@@ -529,6 +573,30 @@ code.im-cmd {
|
|||||||
.im-aud-instructor { border-left-color: #d97706; }
|
.im-aud-instructor { border-left-color: #d97706; }
|
||||||
.im-aud-engineer { border-left-color: #059669; }
|
.im-aud-engineer { border-left-color: #059669; }
|
||||||
|
|
||||||
|
.im-audience-item a {
|
||||||
|
text-decoration: none !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-audience-item a strong {
|
||||||
|
color: #1e293b;
|
||||||
|
transition: color 0.15s ease;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-audience-item a:hover strong {
|
||||||
|
color: #0284c7;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-audience-item p a {
|
||||||
|
font-size: 0.82rem;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #0284c7 !important;
|
||||||
|
text-decoration: none !important;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-audience-item p a:hover {
|
||||||
|
text-decoration: underline !important;
|
||||||
|
}
|
||||||
|
|
||||||
/* ---------- RESPONSIVE ---------- */
|
/* ---------- RESPONSIVE ---------- */
|
||||||
@media (max-width: 768px) {
|
@media (max-width: 768px) {
|
||||||
.im-hero {
|
.im-hero {
|
||||||
@@ -539,6 +607,16 @@ code.im-cmd {
|
|||||||
padding: 0 1.5rem 2rem;
|
padding: 0 1.5rem 2rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.im-arrow-prev { left: -6px; }
|
||||||
|
.im-arrow-next { right: -6px; }
|
||||||
|
.im-arrow {
|
||||||
|
width: 30px;
|
||||||
|
height: 30px;
|
||||||
|
font-size: 1.1rem;
|
||||||
|
background: rgba(15,23,42,0.8);
|
||||||
|
backdrop-filter: blur(4px);
|
||||||
|
}
|
||||||
|
|
||||||
.im-title {
|
.im-title {
|
||||||
font-size: clamp(2.25rem, 8vw, 3rem);
|
font-size: clamp(2.25rem, 8vw, 3rem);
|
||||||
}
|
}
|
||||||
@@ -552,6 +630,8 @@ code.im-cmd {
|
|||||||
}
|
}
|
||||||
|
|
||||||
.im-stat-num {
|
.im-stat-num {
|
||||||
|
/* Adjusted for text-based stats */
|
||||||
|
text-shadow: 0 0 20px rgba(56, 189, 248, 0.3);
|
||||||
font-size: 1.4rem;
|
font-size: 1.4rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -596,3 +676,67 @@ code.im-cmd {
|
|||||||
max-width: 240px;
|
max-width: 240px;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Overriding stats for top placement */
|
||||||
|
|
||||||
|
.im-stats {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(4, 1fr);
|
||||||
|
gap: 1rem;
|
||||||
|
max-width: 800px;
|
||||||
|
margin: 2.5rem auto 1.5rem;
|
||||||
|
padding: 0;
|
||||||
|
animation: fade-up 0.6s ease both;
|
||||||
|
animation-delay: 0.6s;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-stat {
|
||||||
|
text-align: center;
|
||||||
|
padding: 0.75rem 0.5rem;
|
||||||
|
background: rgba(255, 255, 255, 0.03);
|
||||||
|
border-radius: 10px;
|
||||||
|
border: 1px solid rgba(255, 255, 255, 0.06);
|
||||||
|
backdrop-filter: blur(4px);
|
||||||
|
transition: transform 0.2s ease, background 0.2s ease;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-stat:hover {
|
||||||
|
background: rgba(255, 255, 255, 0.05);
|
||||||
|
transform: translateY(-2px);
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-stat-num {
|
||||||
|
/* Adjusted for text-based stats */
|
||||||
|
display: block;
|
||||||
|
font-size: clamp(1rem, 2.5vw, 1.5rem);
|
||||||
|
font-weight: 800;
|
||||||
|
color: #38bdf8;
|
||||||
|
letter-spacing: -0.02em;
|
||||||
|
line-height: 1.1;
|
||||||
|
}
|
||||||
|
|
||||||
|
.im-stat-label {
|
||||||
|
display: block;
|
||||||
|
font-size: 0.6rem;
|
||||||
|
font-weight: 700;
|
||||||
|
color: #94a3b8;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.08em;
|
||||||
|
margin-top: 0.25rem;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Stable wrapping for smaller screens */
|
||||||
|
@media (max-width: 768px) {
|
||||||
|
.im-stats {
|
||||||
|
grid-template-columns: repeat(2, 1fr);
|
||||||
|
max-width: 400px;
|
||||||
|
gap: 0.75rem;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (max-width: 400px) {
|
||||||
|
.im-stat-num {
|
||||||
|
font-size: 1.3rem; }
|
||||||
|
.im-stat-label { font-size: 0.55rem; }
|
||||||
|
}
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Distributed Training: 3D Parallelism and Scaling Efficiency"
|
title: "Distributed Training: 3D Parallelism and Scaling Efficiency"
|
||||||
subtitle: "Discover why 1024 GPUs rarely deliver 1024× speedup — and how to minimize the gap."
|
subtitle: "Discover why 1024 GPUs rarely deliver 1024× speedup — and how to minimize the gap."
|
||||||
---
|
---
|
||||||
|
|
||||||
::: {.callout-note}
|
::: {.callout-note}
|
||||||
## Background: Why distributed training?
|
## Background: Why distributed training?
|
||||||
|
|
||||||
@@ -139,11 +138,11 @@ result_dp = solver.solve(
|
|||||||
)
|
)
|
||||||
|
|
||||||
node_perf = result_dp["node_performance"]
|
node_perf = result_dp["node_performance"]
|
||||||
print(f"Single-GPU compute time: {node_perf.latency.to('ms'):.1f} ms/step")
|
print(f"Single-GPU compute time: {node_perf.latency.to('ms'):~.1f}/step")
|
||||||
print(f"DP all-reduce overhead: {result_dp['dp_communication_latency'].to('ms'):.2f} ms")
|
print(f"DP all-reduce overhead: {result_dp['dp_communication_latency'].to('ms'):~.2f}")
|
||||||
print(f"Pipeline bubble: {result_dp['pipeline_bubble_latency'].to('ms'):.2f} ms")
|
print(f"Pipeline bubble: {result_dp['pipeline_bubble_latency'].to('ms'):~.2f}")
|
||||||
print(f"")
|
print(f"")
|
||||||
print(f"Total step latency: {result_dp['step_latency_total'].to('ms'):.1f} ms")
|
print(f"Total step latency: {result_dp['step_latency_total'].to('ms'):~.1f}")
|
||||||
print(f"Scaling efficiency: {result_dp['scaling_efficiency']:.1%}")
|
print(f"Scaling efficiency: {result_dp['scaling_efficiency']:.1%}")
|
||||||
print(f"Effective throughput: {result_dp['effective_throughput'].magnitude:.0f} samples/s")
|
print(f"Effective throughput: {result_dp['effective_throughput'].magnitude:.0f} samples/s")
|
||||||
print(f"Parallelism: DP={result_dp['parallelism']['dp']} TP={result_dp['parallelism']['tp']} PP={result_dp['parallelism']['pp']}")
|
print(f"Parallelism: DP={result_dp['parallelism']['dp']} TP={result_dp['parallelism']['tp']} PP={result_dp['parallelism']['pp']}")
|
||||||
@@ -166,12 +165,12 @@ network bandwidth.
|
|||||||
## 4. Ring All-Reduce: The Network Tax
|
## 4. Ring All-Reduce: The Network Tax
|
||||||
|
|
||||||
The `DP all-reduce overhead` comes from the **ring all-reduce algorithm**, which is the
|
The `DP all-reduce overhead` comes from the **ring all-reduce algorithm**, which is the
|
||||||
standard method for gradient synchronization. Its time depends on:
|
standard method for gradient synchronization.
|
||||||
|
|
||||||
$$t_{\text{allreduce}} = 2 \times \frac{M \times (N-1)}{N \times B_{\text{eff}}}$$
|
::: {.callout-note}
|
||||||
|
## 🧮 See the Math
|
||||||
Where $M$ is the message size (model gradient = 2× weights in fp16), $N$ is the number
|
For the full equation deriving All-Reduce overhead from model size, node count, and fabric bandwidth, see the [Mathematical Foundations: Ring All-Reduce](../math.qmd#ring-all-reduce-data-parallelism).
|
||||||
of data-parallel replicas, and $B_{\text{eff}}$ is the effective inter-node bandwidth.
|
:::
|
||||||
|
|
||||||
The following sweep shows how fabric bandwidth affects overhead:
|
The following sweep shows how fabric bandwidth affects overhead:
|
||||||
|
|
||||||
@@ -228,9 +227,10 @@ The downside: a **pipeline bubble**. The first microbatch must flow through all
|
|||||||
the last stage can start processing the second microbatch. During that startup phase, most
|
the last stage can start processing the second microbatch. During that startup phase, most
|
||||||
GPUs are idle.
|
GPUs are idle.
|
||||||
|
|
||||||
$$\text{Bubble fraction} = \frac{P - 1}{P - 1 + M}$$
|
::: {.callout-note}
|
||||||
|
## 🧮 See the Math
|
||||||
Where $P$ is the pipeline depth (number of stages) and $M$ is the number of microbatches.
|
For the full equation governing pipeline bubbles and interleaved 1F1B schedules, see the [Mathematical Foundations: Pipeline Parallelism Bubble](../math.qmd#pipeline-parallelism-bubble).
|
||||||
|
:::
|
||||||
|
|
||||||
```{python}
|
```{python}
|
||||||
print(f"{'PP stages':>10} {'Microbatches':>13} {'Bubble %':>9} {'Comm (ms)':>10} {'Efficiency':>11}")
|
print(f"{'PP stages':>10} {'Microbatches':>13} {'Bubble %':>9} {'Comm (ms)':>10} {'Efficiency':>11}")
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Hello World: Single-Node Roofline"
|
title: "Hello World: Single-Node Roofline"
|
||||||
subtitle: "Predict model performance on hardware before writing a single CUDA kernel."
|
subtitle: "Predict model performance on hardware before writing a single CUDA kernel."
|
||||||
---
|
---
|
||||||
|
|
||||||
::: {.callout-note}
|
::: {.callout-note}
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
Complete the [Getting Started](../getting-started.qmd) guide before this tutorial. It introduces the `Engine.solve` API and the MLSys Zoo.
|
Complete the [Getting Started](../getting-started.qmd) guide before this tutorial. It introduces the `Engine.solve` API and the MLSys Zoo.
|
||||||
@@ -98,7 +97,7 @@ profile = Engine.solve(
|
|||||||
)
|
)
|
||||||
|
|
||||||
print(f"Bottleneck: {profile.bottleneck}")
|
print(f"Bottleneck: {profile.bottleneck}")
|
||||||
print(f"Latency: {profile.latency.to('ms'):.3f} ms per inference")
|
print(f"Latency: {profile.latency.to('ms'):~.3f} per inference")
|
||||||
print(f"Throughput: {profile.throughput:.0f} images/sec")
|
print(f"Throughput: {profile.throughput:.0f} images/sec")
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -129,7 +128,7 @@ for batch in [1, 4, 16, 32, 64, 128, 256]:
|
|||||||
print(
|
print(
|
||||||
f"{batch:>6} {p.bottleneck:<16} "
|
f"{batch:>6} {p.bottleneck:<16} "
|
||||||
f"{p.throughput:>10.0f}/s "
|
f"{p.throughput:>10.0f}/s "
|
||||||
f"{p.latency.to('ms'):>8.2f} ms"
|
f"{p.latency.to('ms').magnitude:>8.2f} ms"
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
|
Before Width: | Height: | Size: 51 KiB After Width: | Height: | Size: 90 KiB |
@@ -1,11 +1,7 @@
|
|||||||
---
|
---
|
||||||
title: "Tutorials"
|
title: "Tutorials"
|
||||||
subtitle: "Step-by-step guides for modeling ML Systems."
|
subtitle: "Step-by-step guides for modeling ML Systems."
|
||||||
format:
|
|
||||||
html:
|
|
||||||
toc: false
|
|
||||||
---
|
---
|
||||||
|
|
||||||
These tutorials are designed to build intuition for ML systems using the `mlsysim` framework.
|
These tutorials are designed to build intuition for ML systems using the `mlsysim` framework.
|
||||||
They map directly to chapters in the *Machine Learning Systems* textbook—start at the beginning
|
They map directly to chapters in the *Machine Learning Systems* textbook—start at the beginning
|
||||||
or jump to any topic.
|
or jump to any topic.
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "LLM Serving Lab: TTFT, ITL, and the Memory Wall"
|
title: "LLM Serving Lab: TTFT, ITL, and the Memory Wall"
|
||||||
subtitle: "Model the two physical regimes of LLM inference before deploying a single server."
|
subtitle: "Model the two physical regimes of LLM inference before deploying a single server."
|
||||||
---
|
---
|
||||||
|
|
||||||
::: {.callout-note}
|
::: {.callout-note}
|
||||||
## Background: What is an LLM and why is serving different?
|
## Background: What is an LLM and why is serving different?
|
||||||
|
|
||||||
@@ -139,12 +138,12 @@ print(f"Memory util: {result['memory_utilization']:.1%}")
|
|||||||
## 3. The KV-Cache Memory Wall
|
## 3. The KV-Cache Memory Wall
|
||||||
|
|
||||||
The KV-cache stores the Key and Value matrices from every attention layer for every token
|
The KV-cache stores the Key and Value matrices from every attention layer for every token
|
||||||
in the active context. Its size grows as:
|
in the active context. This statefulness is what makes LLM decoding uniquely memory-bound.
|
||||||
|
|
||||||
$$\text{KV-Cache} = 2 \times L \times H_{kv} \times d_{head} \times S \times B \times \text{bpp}$$
|
::: {.callout-note}
|
||||||
|
## 🧮 See the Math
|
||||||
Where $L$ = layers, $H_{kv}$ = KV heads, $S$ = sequence length, $B$ = batch size,
|
To see the exact formula for how KV-Cache size scales with sequence length, batch size, and network architecture, see the [Mathematical Foundations: KV-Cache Size](../math.qmd#kv-cache-size).
|
||||||
$\text{bpp}$ = bytes per parameter.
|
:::
|
||||||
|
|
||||||
This means doubling `batch_size` doubles the KV-cache. At some point, you hit the
|
This means doubling `batch_size` doubles the KV-cache. At some point, you hit the
|
||||||
**memory wall** — the combined model + KV-cache exceeds the accelerator's HBM capacity.
|
**memory wall** — the combined model + KV-cache exceeds the accelerator's HBM capacity.
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "Sustainability Lab: Modeling Carbon Footprint"
|
title: "Sustainability Lab: Modeling Carbon Footprint"
|
||||||
subtitle: "Same model, same hardware — 41x difference in carbon footprint."
|
subtitle: "Same model, same hardware — 41x difference in carbon footprint."
|
||||||
---
|
---
|
||||||
|
|
||||||
::: {.callout-note}
|
::: {.callout-note}
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
This tutorial can be completed independently, but completing the [Hello World tutorial](hello_world.qmd) first provides useful context on how hardware performance relates to energy consumption.
|
This tutorial can be completed independently, but completing the [Hello World tutorial](hello_world.qmd) first provides useful context on how hardware performance relates to energy consumption.
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ affiliation: "Harvard University"
|
|||||||
bibliography: references.bib
|
bibliography: references.bib
|
||||||
csl: https://raw.githubusercontent.com/citation-style-language/styles/master/ieee.csl
|
csl: https://raw.githubusercontent.com/citation-style-language/styles/master/ieee.csl
|
||||||
---
|
---
|
||||||
|
|
||||||
## Abstract
|
## Abstract
|
||||||
|
|
||||||
Machine learning systems education faces a practical gap: the hardware students need to reason about — H100 clusters, InfiniBand fabrics, multi-megawatt datacenters — is inaccessible for hands-on experimentation. We present **MLSYSIM**, a first-principles analytical engine designed as the companion framework to the *Machine Learning Systems* textbook [@mlsysbook2024]. MLSYSIM provides six composable solvers covering single-node performance (Roofline), distributed training (3D Parallelism), LLM serving (Pre-fill vs. Decode), Total Cost of Ownership, carbon footprint, and cluster reliability. All quantities carry physical units via `pint.Quantity` types, enforcing dimensional correctness at runtime. A vetted registry of 18 hardware devices, 15 model architectures, and 4 regional grid profiles provides a single source of truth that keeps textbook exercises grounded in real-world specifications. The platform is open source and available at [mlsysbook.ai](https://mlsysbook.ai).
|
Machine learning systems education faces a practical gap: the hardware students need to reason about — H100 clusters, InfiniBand fabrics, multi-megawatt datacenters — is inaccessible for hands-on experimentation. We present **MLSYSIM**, a first-principles analytical engine designed as the companion framework to the *Machine Learning Systems* textbook [@mlsysbook2024]. MLSYSIM provides six composable solvers covering single-node performance (Roofline), distributed training (3D Parallelism), LLM serving (Pre-fill vs. Decode), Total Cost of Ownership, carbon footprint, and cluster reliability. All quantities carry physical units via `pint.Quantity` types, enforcing dimensional correctness at runtime. A vetted registry of 18 hardware devices, 15 model architectures, and 4 regional grid profiles provides a single source of truth that keeps textbook exercises grounded in real-world specifications. The platform is open source and available at [mlsysbook.ai](https://mlsysbook.ai).
|
||||||
@@ -62,6 +61,7 @@ This paper makes three contributions:
|
|||||||
MLSYSIM organizes the ML systems domain into five composable layers, following a strategy we call **Progressive Lowering**: abstract workload demand is progressively mapped onto concrete hardware supply through intermediate representations.
|
MLSYSIM organizes the ML systems domain into five composable layers, following a strategy we call **Progressive Lowering**: abstract workload demand is progressively mapped onto concrete hardware supply through intermediate representations.
|
||||||
|
|
||||||
```{mermaid}
|
```{mermaid}
|
||||||
|
%%{init: {'theme': 'neutral'}}%%
|
||||||
%%| fig-cap: "The MLSYSIM 5-Layer Stack. Workloads (demand) are lowered onto Hardware (supply) through Infrastructure and Systems layers. Solvers bridge demand and supply to produce analytical profiles."
|
%%| fig-cap: "The MLSYSIM 5-Layer Stack. Workloads (demand) are lowered onto Hardware (supply) through Infrastructure and Systems layers. Solvers bridge demand and supply to produce analytical profiles."
|
||||||
%%| fig-width: 100%
|
%%| fig-width: 100%
|
||||||
flowchart TB
|
flowchart TB
|
||||||
@@ -166,7 +166,7 @@ LLM inference has two physically distinct phases:
|
|||||||
|
|
||||||
2. **Decode** (Memory-Bound): Each token requires reading all model weights plus the KV-cache from HBM. Latency per token scales with `(weight_bytes + kv_cache_bytes) / bandwidth`. This determines Inter-Token Latency (ITL).
|
2. **Decode** (Memory-Bound): Each token requires reading all model weights plus the KV-cache from HBM. Latency per token scales with `(weight_bytes + kv_cache_bytes) / bandwidth`. This determines Inter-Token Latency (ITL).
|
||||||
|
|
||||||
The solver also computes KV-cache memory:
|
The solver also computes KV-cache memory [@kwon2023efficient]:
|
||||||
|
|
||||||
$$\text{KV-cache} = 2 \times n_\text{layers} \times n_\text{kv\_heads} \times d_\text{head} \times \text{seq\_len} \times \text{batch} \times \text{bytes/element}$$
|
$$\text{KV-cache} = 2 \times n_\text{layers} \times n_\text{kv\_heads} \times d_\text{head} \times \text{seq\_len} \times \text{batch} \times \text{bytes/element}$$
|
||||||
|
|
||||||
@@ -182,17 +182,17 @@ For fleet-scale training, the solver decomposes the workload using three paralle
|
|||||||
|
|
||||||
The total accelerator count constrains the decomposition: `dp_size * tp_size * pp_size = total_accelerators`.
|
The total accelerator count constrains the decomposition: `dp_size * tp_size * pp_size = total_accelerators`.
|
||||||
|
|
||||||
**Communication overhead** is modeled using the ring all-reduce formula:
|
**Communication overhead** is modeled using the ring all-reduce formula [@dean2012large]:
|
||||||
|
|
||||||
$$T_{\text{ring}} = 2 \cdot \frac{N-1}{N} \cdot \frac{S}{\text{BW}} + 2(N-1) \cdot \alpha$$
|
$$T_{\text{ring}} = 2 \cdot \frac{N-1}{N} \cdot \frac{S}{\text{BW}} + 2(N-1) \cdot \alpha$$
|
||||||
|
|
||||||
where $N$ is the number of workers, $S$ is the message size (gradient tensor bytes), BW is the effective fabric bandwidth (accounting for oversubscription), and $\alpha$ is the per-message latency.
|
where $N$ is the number of workers, $S$ is the message size (gradient tensor bytes), BW is the effective fabric bandwidth (accounting for oversubscription), and $\alpha$ is the per-message latency.
|
||||||
|
|
||||||
**Pipeline bubble fraction** follows the standard model:
|
**Pipeline bubble fraction** follows the interleaved pipeline model [@narayanan2021efficient]:
|
||||||
|
|
||||||
$$\text{Bubble} = \frac{P - 1}{P - 1 + M}$$
|
$$\text{Bubble} = \frac{P - 1}{V \times M + P - 1}$$
|
||||||
|
|
||||||
where $P$ is the pipeline depth and $M$ is the number of microbatches.
|
where $P$ is the pipeline depth, $M$ is the number of microbatches, and $V$ is the number of virtual stages per GPU.
|
||||||
|
|
||||||
**Scaling efficiency** is computed as:
|
**Scaling efficiency** is computed as:
|
||||||
|
|
||||||
@@ -244,6 +244,7 @@ where $\delta$ is the time to save one checkpoint.
|
|||||||
Real-world questions often require chaining multiple solvers. For example, answering "Can I serve Llama-70B on 4x H100s, and what will it cost?" requires the ServingSolver (feasibility and latency) followed by the EconomicsSolver (per-query cost). Similarly, "What is the most sustainable way to train GPT-3?" chains the DistributedSolver (optimal parallelism) with the SustainabilitySolver (carbon by region).
|
Real-world questions often require chaining multiple solvers. For example, answering "Can I serve Llama-70B on 4x H100s, and what will it cost?" requires the ServingSolver (feasibility and latency) followed by the EconomicsSolver (per-query cost). Similarly, "What is the most sustainable way to train GPT-3?" chains the DistributedSolver (optimal parallelism) with the SustainabilitySolver (carbon by region).
|
||||||
|
|
||||||
```{mermaid}
|
```{mermaid}
|
||||||
|
%%{init: {'theme': 'neutral'}}%%
|
||||||
%%| fig-cap: "Solver composition for compound questions. Each solver's output feeds the next, enabling multi-dimensional analysis."
|
%%| fig-cap: "Solver composition for compound questions. Each solver's output feeds the next, enabling multi-dimensional analysis."
|
||||||
%%| fig-width: 100%
|
%%| fig-width: 100%
|
||||||
flowchart LR
|
flowchart LR
|
||||||
|
|||||||
88
mlsysim/docs/zoo/composition.svg
Normal file
88
mlsysim/docs/zoo/composition.svg
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 600 500" width="100%" height="100%" style="background-color: white; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Helvetica, Arial, sans-serif;">
|
||||||
|
<defs>
|
||||||
|
<marker id="arrowhead" markerWidth="10" markerHeight="7" refX="9" refY="3.5" orient="auto">
|
||||||
|
<polygon points="0 0, 10 3.5, 0 7" fill="#64748b" />
|
||||||
|
</marker>
|
||||||
|
<style>
|
||||||
|
.node-rect { stroke-width: 1.5; rx: 6; ry: 6; }
|
||||||
|
.node-text-title { font-size: 14px; font-weight: 600; fill: #0f172a; text-anchor: middle; }
|
||||||
|
.node-text-sub { font-size: 11px; fill: #475569; text-anchor: middle; }
|
||||||
|
.edge-line { stroke: #94a3b8; stroke-width: 1.5; fill: none; }
|
||||||
|
.edge-label { font-size: 11px; fill: #64748b; text-anchor: middle; font-weight: 500; }
|
||||||
|
.bg-rect { fill: white; }
|
||||||
|
</style>
|
||||||
|
</defs>
|
||||||
|
|
||||||
|
<!-- HardwareNode -->
|
||||||
|
<g transform="translate(240, 40)">
|
||||||
|
<rect width="120" height="50" class="node-rect" fill="#e0f2fe" stroke="#38bdf8" />
|
||||||
|
<text x="60" y="22" class="node-text-title">HardwareNode</text>
|
||||||
|
<text x="60" y="38" class="node-text-sub">Silicon / Chip</text>
|
||||||
|
</g>
|
||||||
|
|
||||||
|
<!-- Edge: HardwareNode to Node -->
|
||||||
|
<path d="M 300 90 L 300 130" class="edge-line" marker-end="url(#arrowhead)" />
|
||||||
|
<rect x="260" y="102" width="80" height="16" fill="white" />
|
||||||
|
<text x="300" y="114" class="edge-label">accelerates</text>
|
||||||
|
|
||||||
|
<!-- Node -->
|
||||||
|
<g transform="translate(240, 140)">
|
||||||
|
<rect width="120" height="50" class="node-rect" fill="#f8fafc" stroke="#cbd5e1" />
|
||||||
|
<text x="60" y="22" class="node-text-title">Node</text>
|
||||||
|
<text x="60" y="38" class="node-text-sub">Server Chassis</text>
|
||||||
|
</g>
|
||||||
|
|
||||||
|
<!-- NetworkFabric -->
|
||||||
|
<g transform="translate(80, 140)">
|
||||||
|
<rect width="120" height="50" class="node-rect" fill="#f8fafc" stroke="#cbd5e1" />
|
||||||
|
<text x="60" y="22" class="node-text-title">NetworkFabric</text>
|
||||||
|
<text x="60" y="38" class="node-text-sub">Interconnect</text>
|
||||||
|
</g>
|
||||||
|
|
||||||
|
<!-- Edge: Node to Fleet -->
|
||||||
|
<path d="M 300 190 L 300 240" class="edge-line" marker-end="url(#arrowhead)" />
|
||||||
|
<rect x="265" y="207" width="70" height="16" fill="white" />
|
||||||
|
<text x="300" y="219" class="edge-label">composes</text>
|
||||||
|
|
||||||
|
<!-- Edge: NetworkFabric to Fleet -->
|
||||||
|
<path d="M 140 190 C 140 215, 230 215, 230 245" class="edge-line" marker-end="url(#arrowhead)" />
|
||||||
|
<rect x="145" y="215" width="60" height="16" fill="white" />
|
||||||
|
<text x="175" y="227" class="edge-label">connects</text>
|
||||||
|
|
||||||
|
<!-- Fleet -->
|
||||||
|
<g transform="translate(240, 250)">
|
||||||
|
<rect width="120" height="50" class="node-rect" fill="#e0f2fe" stroke="#38bdf8" />
|
||||||
|
<text x="60" y="22" class="node-text-title">Fleet</text>
|
||||||
|
<text x="60" y="38" class="node-text-sub">Cluster</text>
|
||||||
|
</g>
|
||||||
|
|
||||||
|
<!-- GridProfile -->
|
||||||
|
<g transform="translate(80, 360)">
|
||||||
|
<rect width="120" height="50" class="node-rect" fill="#dcfce7" stroke="#4ade80" />
|
||||||
|
<text x="60" y="22" class="node-text-title">GridProfile</text>
|
||||||
|
<text x="60" y="38" class="node-text-sub">Regional Power</text>
|
||||||
|
</g>
|
||||||
|
|
||||||
|
<!-- Edge: Fleet to Datacenter -->
|
||||||
|
<path d="M 300 300 L 300 350" class="edge-line" marker-end="url(#arrowhead)" />
|
||||||
|
<rect x="260" y="317" width="80" height="16" fill="white" />
|
||||||
|
<text x="300" y="329" class="edge-label">is hosted in</text>
|
||||||
|
|
||||||
|
<!-- Edge: GridProfile to Datacenter -->
|
||||||
|
<path d="M 200 385 L 230 385" class="edge-line" marker-end="url(#arrowhead)" />
|
||||||
|
<rect x="195" y="377" width="50" height="16" fill="white" />
|
||||||
|
<text x="215" y="389" class="edge-label">powers</text>
|
||||||
|
|
||||||
|
<!-- Datacenter -->
|
||||||
|
<g transform="translate(240, 360)">
|
||||||
|
<rect width="120" height="50" class="node-rect" fill="#f8fafc" stroke="#cbd5e1" />
|
||||||
|
<text x="60" y="22" class="node-text-title">Datacenter</text>
|
||||||
|
<text x="60" y="38" class="node-text-sub">Physical Facility</text>
|
||||||
|
</g>
|
||||||
|
|
||||||
|
<!-- Dashed bounding box for Systems -->
|
||||||
|
<rect x="60" y="120" width="320" height="200" fill="none" stroke="#cbd5e1" stroke-width="2" stroke-dasharray="8,4" rx="8" />
|
||||||
|
<rect x="70" y="112" width="135" height="16" fill="white" />
|
||||||
|
<text x="135" y="124" class="node-text-sub" font-weight="bold">Layer D: Systems</text>
|
||||||
|
|
||||||
|
</svg>
|
||||||
|
After Width: | Height: | Size: 4.0 KiB |
@@ -2,7 +2,6 @@
|
|||||||
title: "The Fleet Zoo"
|
title: "The Fleet Zoo"
|
||||||
subtitle: "Vetted System Archetypes and Multi-Node Clusters"
|
subtitle: "Vetted System Archetypes and Multi-Node Clusters"
|
||||||
---
|
---
|
||||||
|
|
||||||
The Fleet Zoo defines the **Structural Context** of ML systems—from single microcontrollers to
|
The Fleet Zoo defines the **Structural Context** of ML systems—from single microcontrollers to
|
||||||
warehouse-scale supercomputers. Fleets combine hardware nodes, network fabric, and a count to
|
warehouse-scale supercomputers. Fleets combine hardware nodes, network fabric, and a count to
|
||||||
form a complete system that the `DistributedSolver` can analyze.
|
form a complete system that the `DistributedSolver` can analyze.
|
||||||
@@ -79,7 +78,7 @@ else:
|
|||||||
|
|
||||||
### Why Fleet Size Matters
|
### Why Fleet Size Matters
|
||||||
|
|
||||||
Distributed training performance is dominated by **communication overhead**. As you add more nodes, each all-reduce synchronization step must transfer gradient data across the fabric. The `DistributedSolver` models this trade-off using the ring all-reduce formula:
|
Distributed training performance is dominated by **communication overhead**. As you add more nodes, each all-reduce synchronization step must transfer gradient data across the fabric. The `DistributedSolver` models this trade-off using the ring all-reduce formula [@dean2012large]:
|
||||||
|
|
||||||
$$T_{\text{dp}} = 2(N-1) \cdot \left(\frac{M/N}{BW} + L\right)$$
|
$$T_{\text{dp}} = 2(N-1) \cdot \left(\frac{M/N}{BW} + L\right)$$
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "The Silicon Zoo"
|
title: "The Silicon Zoo"
|
||||||
subtitle: "Vetted Specifications for AI Accelerators and Edge Devices"
|
subtitle: "Vetted Specifications for AI Accelerators and Edge Devices"
|
||||||
---
|
---
|
||||||
|
|
||||||
The Silicon Zoo is the **Single Source of Truth (SSoT)** for all physical hardware in `mlsysim`.
|
The Silicon Zoo is the **Single Source of Truth (SSoT)** for all physical hardware in `mlsysim`.
|
||||||
Every specification is typed (`pint.Quantity`), provenance-tracked, and validated against official
|
Every specification is typed (`pint.Quantity`), provenance-tracked, and validated against official
|
||||||
datasheets and MLPerf baselines—so you never have to argue about what the A100's bandwidth actually is.
|
datasheets and MLPerf baselines—so you never have to argue about what the A100's bandwidth actually is.
|
||||||
@@ -49,10 +48,10 @@ def print_hardware_table(title, hardware_class):
|
|||||||
print("| Device | Year | Peak Performance | Memory BW | Capacity | TDP |")
|
print("| Device | Year | Peak Performance | Memory BW | Capacity | TDP |")
|
||||||
print("|:---|:---:|:---:|:---:|:---:|:---:|")
|
print("|:---|:---:|:---:|:---:|:---:|:---:|")
|
||||||
|
|
||||||
for attr_name in sorted(dir(hardware_class)):
|
# Use the new Registry .list() method for coherent sorting
|
||||||
if attr_name.startswith("_"): continue
|
items = hardware_class.list(sort_by='release_year', reverse=True)
|
||||||
item = getattr(hardware_class, attr_name)
|
for item in items:
|
||||||
if "HardwareNode" in type(item).__name__:
|
if True: # Registry already filtered for us
|
||||||
flops = auto_scale(item.compute.peak_flops)
|
flops = auto_scale(item.compute.peak_flops)
|
||||||
bw = auto_scale(item.memory.bandwidth)
|
bw = auto_scale(item.memory.bandwidth)
|
||||||
cap = auto_scale(item.memory.capacity)
|
cap = auto_scale(item.memory.capacity)
|
||||||
@@ -101,4 +100,12 @@ These specifications are used throughout Volumes 1 and 2 of the textbook. The *H
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
||||||
|
::: {.callout-note}
|
||||||
|
## Missing a device?
|
||||||
|
You can define custom hardware specs on-the-fly in Python or contribute new vetted specs to the registry.
|
||||||
|
See the [Contributing Guide](../contributing.qmd) for how to add persistent specs, or the
|
||||||
|
[Hardware API Reference](../api/hardware.qmd) for defining custom objects.
|
||||||
|
:::
|
||||||
|
|
||||||
*Note: For full technical specs and validation details, see the API Reference.*
|
*Note: For full technical specs and validation details, see the API Reference.*
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "The MLSys Zoo"
|
title: "The MLSys Zoo"
|
||||||
subtitle: "A Single Source of Truth for ML Systems Specifications"
|
subtitle: "A Single Source of Truth for ML Systems Specifications"
|
||||||
---
|
---
|
||||||
|
|
||||||
The MLSys Zoo is a centralized, vetted registry of specifications used throughout
|
The MLSys Zoo is a centralized, vetted registry of specifications used throughout
|
||||||
the `mlsysim` platform. Every entry is strictly typed with `pint.Quantity` for
|
the `mlsysim` platform. Every entry is strictly typed with `pint.Quantity` for
|
||||||
dimensional correctness, provenance-tracked, and validated against official sources.
|
dimensional correctness, provenance-tracked, and validated against official sources.
|
||||||
@@ -29,23 +28,19 @@ to every solver and tutorial.
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Understanding the 5-Layer Stack
|
## System Composition Hierarchy
|
||||||
|
|
||||||
The Zoo catalogs map onto the five analytical layers of MLSYSIM:
|
ML systems are structurally composed of smaller parts. The `mlsysim` registry reflects this physical reality. Before a workload can be evaluated, the structural components are combined into a coherent system.
|
||||||
|
|
||||||
```
|
Here is how the components in the Zoo relate to each other:
|
||||||
[Workloads] ← Model Zoo (what the algorithm demands)
|
|
||||||
↓
|
|
||||||
[Hardware] ← Silicon Zoo (what the chip supplies)
|
|
||||||
↓
|
|
||||||
[Infrastructure] ← Infrastructure Zoo (the environment it runs in)
|
|
||||||
↓
|
|
||||||
[Systems] ← Fleet Zoo (the structural arrangement)
|
|
||||||
↓
|
|
||||||
[Solvers] ← Engine (lowers demand onto supply, produces profile)
|
|
||||||
```
|
|
||||||
|
|
||||||
Each Zoo catalog is the authoritative input to one layer of the progressive lowering stack.
|
{fig-align="center" width="100%"}
|
||||||
|
|
||||||
|
1. **HardwareNode (Silicon):** The fundamental unit of compute (e.g., an H100 GPU or a DGX Spark GB10 superchip). It provides FLOPs and Memory Bandwidth.
|
||||||
|
2. **Node:** A single server chassis. It contains one or more `HardwareNode`s connected by a high-speed intra-node bus (like NVLink).
|
||||||
|
3. **NetworkFabric:** The inter-node networking (e.g., InfiniBand NDR or 100GbE) that allows servers to communicate.
|
||||||
|
4. **Fleet (Cluster):** A collection of `Node`s connected by a `NetworkFabric`. This is the top-level entity used for distributed training and cluster reliability models.
|
||||||
|
5. **Datacenter & GridProfile (Infra/Regions):** The physical facility and regional power grid that hosts the `Fleet`. It dictates the Power Usage Effectiveness (PUE) and the carbon intensity of the electricity consumed.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "The Infrastructure Zoo"
|
title: "The Infrastructure Zoo"
|
||||||
subtitle: "Regional Grids and Sustainability Baselines"
|
subtitle: "Regional Grids and Sustainability Baselines"
|
||||||
---
|
---
|
||||||
|
|
||||||
The Infrastructure Zoo provides the **Environmental Context** for ML deployments—the carbon intensity
|
The Infrastructure Zoo provides the **Environmental Context** for ML deployments—the carbon intensity
|
||||||
of regional electricity grids and datacenter efficiency profiles. Every value is sourced from
|
of regional electricity grids and datacenter efficiency profiles. Every value is sourced from
|
||||||
published government energy data and IEA reporting.
|
published government energy data and IEA reporting.
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
|
|||||||
title: "The Model Zoo"
|
title: "The Model Zoo"
|
||||||
subtitle: "Reference Workloads for Systems Modeling"
|
subtitle: "Reference Workloads for Systems Modeling"
|
||||||
---
|
---
|
||||||
|
|
||||||
The Model Zoo defines the **Computational Demand** placed on the hardware. Every workload is
|
The Model Zoo defines the **Computational Demand** placed on the hardware. Every workload is
|
||||||
pulled from the `mlsysim.Models` registry and characterized by its FLOPs, parameter count, and
|
pulled from the `mlsysim.Models` registry and characterized by its FLOPs, parameter count, and
|
||||||
architecture type—independent of any specific hardware.
|
architecture type—independent of any specific hardware.
|
||||||
@@ -97,4 +96,12 @@ The *Model Training* and *Model Serving* chapters use these workload profiles to
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
*Note: For dynamic memory footprint and KV-cache calculations, see the API Reference.*
|
*
|
||||||
|
::: {.callout-note}
|
||||||
|
## Add your own model
|
||||||
|
Defining custom workloads is straightforward. You can extend the registry or define a
|
||||||
|
(or ) object directly in your code.
|
||||||
|
Learn more in the [Contributing Guide](../contributing.qmd) and the [Models API Reference](../api/models.qmd).
|
||||||
|
:::
|
||||||
|
|
||||||
|
Note: For dynamic memory footprint and KV-cache calculations, see the API Reference.*
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
from .types import HardwareNode, ComputeCore, MemoryHierarchy
|
from .types import HardwareNode, ComputeCore, MemoryHierarchy
|
||||||
|
from ..core.registry import Registry
|
||||||
from ..core.constants import (
|
from ..core.constants import (
|
||||||
ureg,
|
ureg,
|
||||||
V100_MEM_BW, V100_FLOPS_FP16_TENSOR, V100_MEM_CAPACITY, V100_TDP, V100_FLOPS_FP32,
|
V100_MEM_BW, V100_FLOPS_FP16_TENSOR, V100_MEM_CAPACITY, V100_TDP, V100_FLOPS_FP32,
|
||||||
@@ -10,7 +11,7 @@ from ..core.constants import (
|
|||||||
T4_MEM_BW, T4_FLOPS_FP16_TENSOR, T4_TDP, T4_FLOPS_INT8
|
T4_MEM_BW, T4_FLOPS_FP16_TENSOR, T4_TDP, T4_FLOPS_INT8
|
||||||
)
|
)
|
||||||
|
|
||||||
class CloudHardware:
|
class CloudHardware(Registry):
|
||||||
"""Datacenter-scale accelerators (Volume II)."""
|
"""Datacenter-scale accelerators (Volume II)."""
|
||||||
V100 = HardwareNode(
|
V100 = HardwareNode(
|
||||||
name="NVIDIA V100",
|
name="NVIDIA V100",
|
||||||
@@ -86,8 +87,20 @@ class CloudHardware:
|
|||||||
dispatch_tax=0.03 * ureg.ms
|
dispatch_tax=0.03 * ureg.ms
|
||||||
)
|
)
|
||||||
|
|
||||||
class WorkstationHardware:
|
class WorkstationHardware(Registry):
|
||||||
"""Personal computing systems used for local development."""
|
"""Personal computing systems used for local development."""
|
||||||
|
DGX_Spark = HardwareNode(
|
||||||
|
name="NVIDIA DGX Spark (GB10)",
|
||||||
|
release_year=2024,
|
||||||
|
compute=ComputeCore(
|
||||||
|
peak_flops=250 * ureg.TFLOPs/ureg.s,
|
||||||
|
precision_flops={"fp8": 500 * ureg.TFLOPs/ureg.s, "fp4": 1000 * ureg.TFLOPs/ureg.s}
|
||||||
|
),
|
||||||
|
memory=MemoryHierarchy(capacity=128 * ureg.GB, bandwidth=500 * ureg.GB/ureg.s),
|
||||||
|
tdp=250 * ureg.W,
|
||||||
|
dispatch_tax=0.01 * ureg.ms
|
||||||
|
)
|
||||||
|
|
||||||
MacBookM3Max = HardwareNode(
|
MacBookM3Max = HardwareNode(
|
||||||
name="MacBook Pro (M3 Max)",
|
name="MacBook Pro (M3 Max)",
|
||||||
release_year=2023,
|
release_year=2023,
|
||||||
@@ -97,7 +110,7 @@ class WorkstationHardware:
|
|||||||
dispatch_tax=0.05 * ureg.ms
|
dispatch_tax=0.05 * ureg.ms
|
||||||
)
|
)
|
||||||
|
|
||||||
class MobileHardware:
|
class MobileHardware(Registry):
|
||||||
"""Smartphone and handheld devices (Volume I)."""
|
"""Smartphone and handheld devices (Volume I)."""
|
||||||
iPhone15Pro = HardwareNode(
|
iPhone15Pro = HardwareNode(
|
||||||
name="iPhone 15 Pro (A17 Pro)",
|
name="iPhone 15 Pro (A17 Pro)",
|
||||||
@@ -127,7 +140,7 @@ class MobileHardware:
|
|||||||
dispatch_tax=1.5 * ureg.ms
|
dispatch_tax=1.5 * ureg.ms
|
||||||
)
|
)
|
||||||
|
|
||||||
class EdgeHardware:
|
class EdgeHardware(Registry):
|
||||||
"""Robotics and Industrial Edge (Volume I)."""
|
"""Robotics and Industrial Edge (Volume I)."""
|
||||||
JetsonOrinNX = HardwareNode(
|
JetsonOrinNX = HardwareNode(
|
||||||
name="NVIDIA Jetson Orin NX",
|
name="NVIDIA Jetson Orin NX",
|
||||||
@@ -165,7 +178,7 @@ class EdgeHardware:
|
|||||||
dispatch_tax=0.1 * ureg.ms
|
dispatch_tax=0.1 * ureg.ms
|
||||||
)
|
)
|
||||||
|
|
||||||
class TinyHardware:
|
class TinyHardware(Registry):
|
||||||
"""Microcontrollers and sub-watt devices."""
|
"""Microcontrollers and sub-watt devices."""
|
||||||
ESP32_S3 = HardwareNode(
|
ESP32_S3 = HardwareNode(
|
||||||
name="ESP32-S3 (AI)",
|
name="ESP32-S3 (AI)",
|
||||||
@@ -186,7 +199,7 @@ class TinyHardware:
|
|||||||
dispatch_tax=2.0 * ureg.ms
|
dispatch_tax=2.0 * ureg.ms
|
||||||
)
|
)
|
||||||
|
|
||||||
class Hardware:
|
class Hardware(Registry):
|
||||||
Cloud = CloudHardware
|
Cloud = CloudHardware
|
||||||
Workstation = WorkstationHardware
|
Workstation = WorkstationHardware
|
||||||
Mobile = MobileHardware
|
Mobile = MobileHardware
|
||||||
@@ -203,6 +216,9 @@ class Hardware:
|
|||||||
TPUv5p = CloudHardware.TPUv5p
|
TPUv5p = CloudHardware.TPUv5p
|
||||||
T4 = CloudHardware.T4
|
T4 = CloudHardware.T4
|
||||||
|
|
||||||
|
DGXSpark = WorkstationHardware.DGX_Spark
|
||||||
|
MacBook = WorkstationHardware.MacBookM3Max
|
||||||
|
|
||||||
iPhone = MobileHardware.iPhone15Pro
|
iPhone = MobileHardware.iPhone15Pro
|
||||||
Snapdragon = MobileHardware.Snapdragon8Gen3
|
Snapdragon = MobileHardware.Snapdragon8Gen3
|
||||||
Jetson = EdgeHardware.JetsonOrinNX
|
Jetson = EdgeHardware.JetsonOrinNX
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
from .types import TransformerWorkload, CNNWorkload, Workload
|
from .types import TransformerWorkload, CNNWorkload, Workload
|
||||||
|
from ..core.registry import Registry
|
||||||
|
from .types import TransformerWorkload, CNNWorkload, Workload
|
||||||
from ..core.constants import (
|
from ..core.constants import (
|
||||||
ureg,
|
ureg,
|
||||||
GPT2_PARAMS, GPT3_PARAMS, GPT4_EST_PARAMS, GPT3_TRAINING_OPS,
|
GPT2_PARAMS, GPT3_PARAMS, GPT4_EST_PARAMS, GPT3_TRAINING_OPS,
|
||||||
@@ -9,7 +11,7 @@ from ..core.constants import (
|
|||||||
ALEXNET_PARAMS, ANOMALY_MODEL_PARAMS, DLRM_MODEL_SIZE_FP32
|
ALEXNET_PARAMS, ANOMALY_MODEL_PARAMS, DLRM_MODEL_SIZE_FP32
|
||||||
)
|
)
|
||||||
|
|
||||||
class LanguageModels:
|
class LanguageModels(Registry):
|
||||||
GPT2 = TransformerWorkload(
|
GPT2 = TransformerWorkload(
|
||||||
name="GPT-2 (1.5B)",
|
name="GPT-2 (1.5B)",
|
||||||
architecture="Transformer",
|
architecture="Transformer",
|
||||||
@@ -77,7 +79,7 @@ class LanguageModels:
|
|||||||
inference_flops=2 * LLAMA3_70B_PARAMS.magnitude * ureg.flop
|
inference_flops=2 * LLAMA3_70B_PARAMS.magnitude * ureg.flop
|
||||||
)
|
)
|
||||||
|
|
||||||
class VisionModels:
|
class VisionModels(Registry):
|
||||||
ResNet50 = CNNWorkload(
|
ResNet50 = CNNWorkload(
|
||||||
name="ResNet-50",
|
name="ResNet-50",
|
||||||
architecture="CNN",
|
architecture="CNN",
|
||||||
@@ -107,7 +109,7 @@ class VisionModels:
|
|||||||
layers=8
|
layers=8
|
||||||
)
|
)
|
||||||
|
|
||||||
class TinyModels:
|
class TinyModels(Registry):
|
||||||
DS_CNN = CNNWorkload(
|
DS_CNN = CNNWorkload(
|
||||||
name="DS-CNN (KWS)",
|
name="DS-CNN (KWS)",
|
||||||
architecture="CNN",
|
architecture="CNN",
|
||||||
@@ -126,7 +128,7 @@ class TinyModels:
|
|||||||
# Generic Workload doesn't have params in type, but we can override
|
# Generic Workload doesn't have params in type, but we can override
|
||||||
)
|
)
|
||||||
|
|
||||||
class RecommendationModels:
|
class RecommendationModels(Registry):
|
||||||
# Special class for DLRM as it's defined by size
|
# Special class for DLRM as it's defined by size
|
||||||
DLRM = Workload(
|
DLRM = Workload(
|
||||||
name="DLRM",
|
name="DLRM",
|
||||||
@@ -136,7 +138,7 @@ class RecommendationModels:
|
|||||||
# Note: We'll add specialized size methods if needed,
|
# Note: We'll add specialized size methods if needed,
|
||||||
# but for now we maintain string compatibility.
|
# but for now we maintain string compatibility.
|
||||||
|
|
||||||
class Models:
|
class Models(Registry):
|
||||||
Language = LanguageModels
|
Language = LanguageModels
|
||||||
Vision = VisionModels
|
Vision = VisionModels
|
||||||
Tiny = TinyModels
|
Tiny = TinyModels
|
||||||
|
|||||||
@@ -127,6 +127,25 @@ class TransformerWorkload(Workload):
|
|||||||
layers=self.layers
|
layers=self.layers
|
||||||
)
|
)
|
||||||
|
|
||||||
|
class SparseTransformerWorkload(TransformerWorkload):
|
||||||
|
active_parameters: Quantity
|
||||||
|
experts: int
|
||||||
|
active_experts_per_token: int = 1
|
||||||
|
|
||||||
|
def lower(self, precision: Quantity = BYTES_FP16) -> ComputationGraph:
|
||||||
|
# For MoE, total parameters define the memory footprint,
|
||||||
|
# but active parameters define the computation flops.
|
||||||
|
ops = self.inference_flops or (2 * self.active_parameters.to(ureg.count).magnitude * ureg.flop)
|
||||||
|
weights = self.size_in_bytes(precision) # uses self.parameters (total params)
|
||||||
|
return ComputationGraph(
|
||||||
|
name=self.name,
|
||||||
|
total_ops=ops,
|
||||||
|
parameter_count=self.parameters,
|
||||||
|
weight_bytes=weights,
|
||||||
|
arithmetic_intensity=(ops / weights).to("flop/byte"),
|
||||||
|
layers=self.layers
|
||||||
|
)
|
||||||
|
|
||||||
class CNNWorkload(Workload):
|
class CNNWorkload(Workload):
|
||||||
parameters: Quantity
|
parameters: Quantity
|
||||||
inference_flops: Quantity
|
inference_flops: Quantity
|
||||||
|
|||||||
@@ -56,38 +56,178 @@ def setup_plot(figsize=(8, 5)):
|
|||||||
|
|
||||||
def plot_roofline(hardware_node, workloads=None):
|
def plot_roofline(hardware_node, workloads=None):
|
||||||
"""
|
"""
|
||||||
Plots a standard Roofline Model for a given HardwareNode.
|
Plots a publication-quality Roofline Model for a given HardwareNode.
|
||||||
Follows the LEGO-style visualization pattern.
|
|
||||||
|
Features:
|
||||||
|
- Ridge point annotated with numeric value
|
||||||
|
- Memory-bound and compute-bound regions shaded and labeled
|
||||||
|
- Memory bandwidth ceiling (diagonal) and compute ceiling (flat)
|
||||||
|
- Workloads plotted with bottleneck classification
|
||||||
"""
|
"""
|
||||||
# 1. PARAMETERS
|
# 1. PARAMETERS
|
||||||
peak_flops = hardware_node.compute.peak_flops.to('TFLOPs/s').magnitude
|
peak_flops = hardware_node.compute.peak_flops.to("TFLOPs/s").magnitude
|
||||||
peak_bw = hardware_node.memory.bandwidth.to('GB/s').magnitude
|
peak_bw = hardware_node.memory.bandwidth.to("GB/s").magnitude
|
||||||
|
ridge_point = peak_flops / (peak_bw / 1000) # FLOP/Byte
|
||||||
# 2. INVARIANTS
|
|
||||||
x_intensities = np.logspace(-1, 4, 100)
|
# 2. AXIS RANGE
|
||||||
|
x_min, x_max = 0.1, 10000
|
||||||
# 3. CALCULATION
|
x = np.logspace(np.log10(x_min), np.log10(x_max), 500)
|
||||||
y_memory_bound = peak_bw * x_intensities / 1000 # TFLOPs equivalent
|
|
||||||
y_compute_bound = np.full_like(x_intensities, peak_flops)
|
# 3. ROOFLINE CURVES
|
||||||
y_roofline = np.minimum(y_memory_bound, y_compute_bound)
|
y_mem = peak_bw * x / 1000 # BW * AI, converted to TFLOP/s
|
||||||
|
y_compute = np.full_like(x, peak_flops)
|
||||||
# 4. OUTPUT (Visualization)
|
y_roof = np.minimum(y_mem, y_compute)
|
||||||
fig, ax, colors, plt = setup_plot()
|
|
||||||
ax.loglog(x_intensities, y_roofline, color=colors['BlueLine'], linewidth=2.5, label=f'{hardware_node.name} Roofline')
|
# 4. PLOT
|
||||||
ax.fill_between(x_intensities, 0, y_roofline, color=colors['BlueFill'], alpha=0.3)
|
fig, ax, colors, _ = setup_plot(figsize=(9, 5.5))
|
||||||
|
|
||||||
|
# Shaded regions
|
||||||
|
mem_mask = x <= ridge_point
|
||||||
|
comp_mask = x >= ridge_point
|
||||||
|
ax.fill_between(
|
||||||
|
x[mem_mask],
|
||||||
|
y_roof[mem_mask] * 0.001,
|
||||||
|
y_roof[mem_mask],
|
||||||
|
color=colors["OrangeL"],
|
||||||
|
alpha=0.5,
|
||||||
|
label="Memory-bound region",
|
||||||
|
)
|
||||||
|
ax.fill_between(
|
||||||
|
x[comp_mask],
|
||||||
|
y_roof[comp_mask] * 0.001,
|
||||||
|
y_roof[comp_mask],
|
||||||
|
color=colors["BlueFill"],
|
||||||
|
alpha=0.5,
|
||||||
|
label="Compute-bound region",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Roofline line
|
||||||
|
ax.loglog(
|
||||||
|
x,
|
||||||
|
y_roof,
|
||||||
|
color=colors["BlueLine"],
|
||||||
|
linewidth=2.5,
|
||||||
|
zorder=5,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Memory bandwidth ceiling label (on the slope)
|
||||||
|
slope_x = ridge_point * 0.08
|
||||||
|
slope_y = peak_bw * slope_x / 1000
|
||||||
|
ax.text(
|
||||||
|
slope_x,
|
||||||
|
slope_y * 1.6,
|
||||||
|
f"BW ceiling: {peak_bw:.0f} GB/s",
|
||||||
|
color=colors["OrangeLine"],
|
||||||
|
fontsize=8.5,
|
||||||
|
fontweight="bold",
|
||||||
|
rotation=38,
|
||||||
|
ha="center",
|
||||||
|
va="bottom",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Compute ceiling label (on the flat)
|
||||||
|
ax.text(
|
||||||
|
ridge_point * 8,
|
||||||
|
peak_flops * 1.12,
|
||||||
|
f"Compute ceiling: {peak_flops:.0f} TFLOP/s",
|
||||||
|
color=colors["BlueLine"],
|
||||||
|
fontsize=8.5,
|
||||||
|
fontweight="bold",
|
||||||
|
ha="center",
|
||||||
|
va="bottom",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Ridge point
|
||||||
|
ax.plot(
|
||||||
|
ridge_point,
|
||||||
|
peak_flops,
|
||||||
|
"D",
|
||||||
|
color=colors["crimson"],
|
||||||
|
markersize=9,
|
||||||
|
zorder=10,
|
||||||
|
)
|
||||||
|
ax.annotate(
|
||||||
|
f"Ridge Point\n{ridge_point:.1f} FLOP/Byte",
|
||||||
|
xy=(ridge_point, peak_flops),
|
||||||
|
xytext=(ridge_point * 3, peak_flops * 0.35),
|
||||||
|
fontsize=8.5,
|
||||||
|
fontweight="bold",
|
||||||
|
color=colors["crimson"],
|
||||||
|
ha="center",
|
||||||
|
arrowprops=dict(
|
||||||
|
arrowstyle="->",
|
||||||
|
color=colors["crimson"],
|
||||||
|
lw=1.2,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Vertical dashed line at ridge point
|
||||||
|
ax.axvline(
|
||||||
|
ridge_point,
|
||||||
|
color=colors["crimson"],
|
||||||
|
linestyle=":",
|
||||||
|
linewidth=0.8,
|
||||||
|
alpha=0.5,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Region labels
|
||||||
|
ax.text(
|
||||||
|
x_min * 1.5,
|
||||||
|
peak_flops * 0.6,
|
||||||
|
"MEMORY\nBOUND",
|
||||||
|
color=colors["OrangeLine"],
|
||||||
|
fontsize=11,
|
||||||
|
fontweight="bold",
|
||||||
|
alpha=0.25,
|
||||||
|
ha="left",
|
||||||
|
va="center",
|
||||||
|
)
|
||||||
|
ax.text(
|
||||||
|
x_max * 0.4,
|
||||||
|
peak_flops * 0.6,
|
||||||
|
"COMPUTE\nBOUND",
|
||||||
|
color=colors["BlueLine"],
|
||||||
|
fontsize=11,
|
||||||
|
fontweight="bold",
|
||||||
|
alpha=0.25,
|
||||||
|
ha="right",
|
||||||
|
va="center",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Plot workloads
|
||||||
if workloads:
|
if workloads:
|
||||||
from ..core.engine import Engine
|
from ..core.engine import Engine
|
||||||
for model in workloads:
|
|
||||||
profile = Engine.solve(model, hardware_node, efficiency=1.0)
|
|
||||||
intensity = profile.arithmetic_intensity.magnitude
|
|
||||||
theoretical_perf = min(peak_bw * intensity / 1000, peak_flops)
|
|
||||||
ax.plot(intensity, theoretical_perf, 'o', color=colors['crimson'], markersize=8)
|
|
||||||
ax.text(intensity * 1.2, theoretical_perf, model.name, color=colors['crimson'], fontsize=9, fontweight='bold')
|
|
||||||
|
|
||||||
ax.set_xlabel('Arithmetic Intensity (FLOP/Byte)')
|
workload_colors = [
|
||||||
ax.set_ylabel('Performance (TFLOPs/s)')
|
colors["crimson"],
|
||||||
ax.set_title(f'Roofline: {hardware_node.name}')
|
colors["GreenLine"],
|
||||||
|
colors["VioletLine"],
|
||||||
|
colors["BrownLine"],
|
||||||
|
]
|
||||||
|
for i, model in enumerate(workloads):
|
||||||
|
profile = Engine.solve(model, hardware_node, efficiency=1.0)
|
||||||
|
ai = profile.arithmetic_intensity.magnitude
|
||||||
|
perf = min(peak_bw * ai / 1000, peak_flops)
|
||||||
|
c = workload_colors[i % len(workload_colors)]
|
||||||
|
bound = "memory" if ai < ridge_point else "compute"
|
||||||
|
ax.plot(ai, perf, "o", color=c, markersize=9, zorder=10)
|
||||||
|
ax.annotate(
|
||||||
|
f"{model.name}\n({bound}-bound)",
|
||||||
|
xy=(ai, perf),
|
||||||
|
xytext=(ai * 0.3, perf * 0.4),
|
||||||
|
fontsize=8,
|
||||||
|
fontweight="bold",
|
||||||
|
color=c,
|
||||||
|
ha="center",
|
||||||
|
arrowprops=dict(arrowstyle="->", color=c, lw=1),
|
||||||
|
)
|
||||||
|
|
||||||
|
ax.set_xlabel("Arithmetic Intensity (FLOP/Byte)")
|
||||||
|
ax.set_ylabel("Performance (TFLOP/s)")
|
||||||
|
ax.set_title(f"Roofline: {hardware_node.name}")
|
||||||
|
ax.set_xlim(x_min, x_max)
|
||||||
|
ax.set_ylim(peak_flops * 0.001, peak_flops * 2)
|
||||||
|
ax.legend(loc="lower right", fontsize=8, framealpha=0.9)
|
||||||
return fig, ax
|
return fig, ax
|
||||||
|
|
||||||
def plot_evaluation_scorecard(evaluation):
|
def plot_evaluation_scorecard(evaluation):
|
||||||
|
|||||||
Reference in New Issue
Block a user