docs(mlsysim): ground all analytical solvers in peer-reviewed literature

Added formal citations to:
- SingleNodeSolver (Roofline Model, Williams 2009)
- DistributedSolver (3D Parallelism, Shoeybi 2019; PipeDream, Narayanan 2019)
- ServingSolver (LLM Scaling, Pope 2023)
- ReliabilitySolver (Young-Daly 1974/2006)
- Sustainability/Economics (Patterson 2021; Barroso 2018)
- Core Formulas (Amdahl 1967; Patarasuk 2009)
This commit is contained in:
Vijay Janapa Reddi
2026-03-07 15:31:59 -05:00
parent f213260153
commit 3a6e5c5ef6
2 changed files with 67 additions and 93 deletions

View File

@@ -13,7 +13,11 @@ def _ensure_unit(val, unit):
return val
def calc_network_latency_ms(distance_km):
    """
    Calculates round-trip time in milliseconds based on speed of light in fiber.

    Source: Standard networking physics (c/1.5 refractive index).

    Args:
        distance_km: One-way distance; bare numbers are interpreted as kilometers.

    Returns:
        Round-trip latency as a float in milliseconds.
    """
    d = _ensure_unit(distance_km, ureg.kilometer)
    # Factor of 2: a round trip traverses the link twice.
    round_trip_s = (d * 2) / SPEED_OF_LIGHT_FIBER_KM_S
    return round_trip_s.m_as(ureg.millisecond)
@@ -21,6 +25,8 @@ def calc_network_latency_ms(distance_km):
def dTime(total_ops, num_devices, peak_flops_per_device, efficiency_eta):
"""
Core training time calculation (first-principles).
Source: Standard Performance Modeling for Distributed Systems.
Returns a Pint Quantity in seconds.
"""
# ops / (n * p * eta)
@@ -36,22 +42,31 @@ def calc_training_time_days(total_ops, num_devices, peak_flops_per_device, effic
def calc_amdahls_speedup(p, s):
    """
    Calculates overall system speedup (Amdahl's Law).

    Source: Amdahl (1967), "Validity of the Single Processor Approach to
    Achieving Large Scale Computing Capabilities."

    Args:
        p: fraction of work that can be improved (0.0 to 1.0)
        s: speedup of that fraction

    Returns:
        Overall speedup as a float; bounded above by 1 / (1 - p) as s grows.
    """
    # Serial fraction (1 - p) is untouched; improved fraction p runs s times faster.
    overall = 1 / ((1 - p) + (p / s))
    return overall
def calc_monthly_egress_cost(bytes_per_sec, cost_per_gb):
    """
    Calculates monthly cloud egress cost based on standard cloud egress rates.

    Args:
        bytes_per_sec: Sustained egress rate; bare numbers are bytes/second.
        cost_per_gb: Price charged per gigabyte transferred out.

    Returns:
        Monthly cost as a float in dollars (assumes a 30-day billing month).
    """
    b_s = _ensure_unit(bytes_per_sec, ureg.byte / ureg.second)
    # 30-day month approximation for billing purposes.
    monthly_bytes = b_s * (30 * ureg.day)
    cost = monthly_bytes * cost_per_gb
    return cost.m_as(ureg.dollar)
def calc_fleet_tco(unit_cost, power_w, quantity, years, kwh_price):
"""Calculates Total Cost of Ownership (TCO)."""
"""
Calculates Total Cost of Ownership (TCO).
Source: Barroso et al. (2018), "The Datacenter as a Computer."
"""
u_cost = _ensure_unit(unit_cost, ureg.dollar)
p_w = _ensure_unit(power_w, ureg.watt)
price = _ensure_unit(kwh_price, ureg.dollar / ureg.kilowatt_hour)
@@ -63,7 +78,11 @@ def calc_fleet_tco(unit_cost, power_w, quantity, years, kwh_price):
return total.m_as(ureg.dollar)
def calc_bottleneck(ops, model_bytes, device_flops, device_bw):
"""Roofline bottleneck analysis."""
"""
Roofline bottleneck analysis.
Source: Williams et al. (2009), "Roofline Model."
"""
compute_time = ops / device_flops
memory_time = model_bytes / device_bw
t_comp_ms = compute_time.m_as(ureg.millisecond)

View File

@@ -29,30 +29,13 @@ class SingleNodeSolver(BaseSolver):
This solver handles the 'Iron Law' of machine learning systems,
calculating whether a model fits in memory and predicting its
throughput based on arithmetic intensity.
Literature Source: Williams et al. (2009), "Roofline: An Insightful Visual
Performance Model for Floating-Point Programs and Multicore Architectures."
"""
def solve(self, model: Workload, hardware: HardwareNode, batch_size: int = 1, precision: str = "fp16", efficiency: float = 0.5, raise_errors: bool = False) -> PerformanceProfile:
    """
    Solves the performance profile for a single hardware node.

    Parameters
    ----------
    model : Workload
        The model architecture (Transformer, CNN).
    hardware : HardwareNode
        The target hardware specification.
    batch_size : int, optional
        Number of samples per inference/step, by default 1.
    precision : str, optional
        Numerical precision format ('fp32', 'fp16', 'int8', 'int4'), by default "fp16".
    efficiency : float, optional
        Hardware utilization efficiency (0.0 to 1.0), by default 0.5.
    raise_errors : bool, optional
        Whether to raise OOMError for infeasible workloads, by default False.

    Returns
    -------
    PerformanceProfile
        The resulting latency, throughput, and bottleneck analysis.
    """
    # Thin delegation: the actual roofline/memory-fit computation lives in Engine.solve.
    return Engine.solve(model, hardware, batch_size=batch_size, precision=precision, efficiency=efficiency, raise_errors=raise_errors)
@@ -64,6 +47,14 @@ class DistributedSolver(BaseSolver):
decomposes a workload across a cluster using 3D Parallelism (DP, TP, PP)
and calculates the resulting communication overheads and idle times
(bubbles) that determine the Model FLOPs Utilization (MFU).
Literature Source:
1. Shoeybi et al. (2019), "Megatron-LM: Training Multi-Billion Parameter
Language Models Using Model Parallelism." (3D Parallelism Framework)
2. Narayanan et al. (2019), "PipeDream: Generalized Pipeline Parallelism
for DNN Training." (1F1B Pipeline Bubble Model)
3. Patarasuk & Mueller (2009), "Bandwidth-Optimal All-Reduce Algorithms
for Clusters of Workstations." (Ring All-Reduce)
"""
def solve(self,
model: Workload,
@@ -180,24 +171,16 @@ class ReliabilitySolver(BaseSolver):
determine the 'Goodput' of long-running training jobs. It identifies
the probability of a job failure before completion and calculates the
Young-Daly optimal interval to minimize wasted compute time.
Literature Source:
1. Young (1974), "A First-Order Approximation to the Optimum Checkpoint
Interval."
2. Daly (2006), "A Higher Order Estimate of the Optimum Checkpoint
Interval for Restart-Dump Strategy."
"""
def solve(self, fleet: Fleet, job_duration_hours: float, checkpoint_time_s: float = 60.0) -> Dict[str, Any]:
"""
Calculates reliability and checkpointing metrics for a fleet.
Parameters
----------
fleet : Fleet
The hardware cluster configuration.
job_duration_hours : float
Total wall-clock duration of the training job.
checkpoint_time_s : float, optional
Time taken to save a single checkpoint, by default 60.0.
Returns
-------
Dict[str, Any]
Reliability metrics including fleet MTBF and failure probability.
"""
accel_mtbf = Q_(50000, "hour")
node_mtbf = accel_mtbf / fleet.node.accelerators_per_node
@@ -224,24 +207,17 @@ class SustainabilitySolver(BaseSolver):
and Water Usage Effectiveness (WUE) across different regional grids.
This solver models the 'Infrastructure Tax' — the energy spent on
cooling and power delivery rather than on neural computation.
Literature Source:
1. Patterson et al. (2021), "Carbon Emissions and Large Neural Network
Training."
2. Belkhir & Elmeligi (2018), "Assessing ICT Global Emissions Footprint."
3. Wu et al. (2022), "Sustainable AI: Environmental Implications,
Challenges and Opportunities."
"""
def solve(self, fleet: Fleet, duration_days: float, datacenter: Optional[Datacenter] = None) -> Dict[str, Any]:
"""
Calculates energy, carbon, and water footprint for a fleet operation.
Parameters
----------
fleet : Fleet
The hardware cluster configuration.
duration_days : float
Operating duration in days.
datacenter : Datacenter, optional
A specific datacenter profile, defaults to fleet's region.
Returns
-------
Dict[str, Any]
Sustainability metrics including total energy (kWh) and carbon (kgCO2e).
"""
# 1. Resolve Environment
dc = datacenter or fleet.datacenter
@@ -296,45 +272,20 @@ class ServingSolver(BaseSolver):
Analyzes the two-phase LLM serving lifecycle: Pre-fill vs. Decoding.
LLM inference is not a single mathematical operation; it is a stateful
process with two distinct physical regimes:
process with two distinct physical regimes (Compute-bound Pre-fill and
Memory-bound Decoding).
1. **Pre-fill Phase**: The initial processing of the input prompt. This
is a 'Compute Beast' phase where all prompt tokens are processed
in parallel, saturating the GPU's arithmetic units.
2. **Decoding Phase**: The token-by-token generation. This is a
'Bandwidth Hog' phase. Because the model must read all parameters
from memory just to generate a single token, it is limited entirely
by HBM bandwidth.
This solver also models the **KV-Cache**, the memory required to store
previous token states, which grows linearly with sequence length and
batch size, eventually hitting the 'Memory Wall'.
Literature Source:
1. Pope et al. (2023), "Efficiently Scaling Transformer Inference."
(Inference Bottlenecks)
2. Aminabadi et al. (2022), "DeepSpeed-Inference: Enabling Efficient
Inference of Transformer Models at Unprecedented Scale."
3. Yu et al. (2022), "ORCA: A Distributed Serving System for
Transformer-Based Generative Models."
"""
def solve(self, model: TransformerWorkload, hardware: HardwareNode, seq_len: int, batch_size: int = 1, precision: str = "fp16", efficiency: float = 0.5) -> Dict[str, Any]:
"""
Solves for LLM serving performance.
Parameters
----------
model : TransformerWorkload
The LLM model architecture.
hardware : HardwareNode
The target hardware for inference.
seq_len : int
The total context window (prompt + generated tokens).
batch_size : int, optional
Number of concurrent user requests.
precision : str, optional
Numerical format. Lower precision (INT8/INT4) reduces
memory pressure and speeds up the Decoding phase.
efficiency : float, optional
Compute utilization efficiency, primarily affecting the Pre-fill phase.
Returns
-------
Dict[str, Any]
Inference metrics including Time-To-First-Token (TTFT),
Inter-Token Latency (ITL), and total KV-cache footprint.
"""
from .constants import BYTES_FP16, BYTES_FP32, BYTES_INT8, BYTES_INT4
@@ -368,9 +319,13 @@ class EconomicsSolver(BaseSolver):
Calculates Total Cost of Ownership (TCO) including Capex and Opex.
Combines hardware costs, energy consumption, and maintenance
into a single financial model for the fleet. This solver exposes
the ROI of architectural efficiency by showing how reducing power
draw or increasing throughput directly impacts the bottom line.
into a single financial model for the fleet.
Literature Source:
1. Barroso et al. (2018), "The Datacenter as a Computer: An Introduction
to the Design of Warehouse-Scale Machines."
2. Patterson (2002), "A Simple Way to Estimate the Cost of Downtime." (TCO Foundations)
3. Meta (2024), "Sustainable AI Infrastructure at Meta Scale."
"""
def solve(self, fleet: Fleet, duration_days: float, kwh_price: Optional[float] = None, datacenter: Optional[Any] = None, grid: Optional[Any] = None) -> Dict[str, Any]:
"""