Files
cs249r_book/mlsysim/systems/registry.py
Vijay Janapa Reddi a78f1bd8b0 feat(mlsysim): add documentation site, typed registries, and 6-solver core
Complete MLSYSIM v0.1.0 implementation with:

- Documentation website (Quarto): landing page with animated hero
  and capability carousel, 4 tutorials (hello world, LLM serving,
  distributed training, sustainability), hardware/model/fleet/infra
  catalogs, solver guide, whitepaper, math foundations, glossary,
  and full quartodoc API reference
- Typed registry system: Hardware (18 devices across 5 tiers),
  Models (15 workloads), Systems (fleets, clusters, fabrics),
  Infrastructure (grid profiles, rack configs, datacenters)
- Core types: Pint-backed Quantity, Metadata provenance tracking,
  custom exception hierarchy (OOMError, SLAViolation)
- SimulationConfig with YAML/JSON loading and pre-validation
- Scenario system tying workloads to systems with SLA constraints
- Multi-level evaluation scorecard (feasibility, performance, macro)
- Examples, tests, and Jetson Orin NX spec fix (100 → 25 TFLOP/s)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 15:59:51 -05:00

80 lines
2.2 KiB
Python

from .types import DeploymentTier, Node, Fleet, NetworkFabric
from ..core.constants import (
ureg,
SMARTPHONE_RAM_GB, MCU_RAM_KIB, CLOUD_MEM_GIB, TINY_MEM_KIB,
INFINIBAND_NDR_BW, INFINIBAND_HDR_BW, NETWORK_10G_BW, NETWORK_100G_BW
)
from ..hardware.registry import Hardware
class Tiers:
    """Registry of reference deployment tiers.

    Each attribute is a ``DeploymentTier`` described by a display name, a
    RAM size, a storage size, and a typical latency budget.
    """

    # NOTE(review): CLOUD_MEM_GIB and TINY_MEM_KIB are imported at file level
    # but are not referenced by any tier below -- confirm whether Cloud / Tiny
    # were meant to read from them instead of the values used here.

    # Largest tier: RAM and storage are literal values, not shared constants.
    Cloud = DeploymentTier(
        name="Cloud",
        ram=512 * ureg.GB,
        storage=10 * ureg.TB,
        typical_latency_budget=200 * ureg.ms,
    )

    Edge = DeploymentTier(
        name="Edge",
        ram=32 * ureg.GB,
        storage=1 * ureg.TB,
        typical_latency_budget=50 * ureg.ms,
    )

    # RAM is taken from the shared SMARTPHONE_RAM_GB constant.
    Mobile = DeploymentTier(
        name="Mobile",
        ram=SMARTPHONE_RAM_GB,
        storage=256 * ureg.GB,
        typical_latency_budget=30 * ureg.ms,
    )

    # The attribute is ``Tiny`` while the tier's display name is "TinyML";
    # RAM is taken from the shared MCU_RAM_KIB constant.
    Tiny = DeploymentTier(
        name="TinyML",
        ram=MCU_RAM_KIB,
        storage=4 * ureg.MB,
        typical_latency_budget=100 * ureg.ms,
    )
class Nodes:
    """Registry of reference compute nodes.

    Each attribute is a ``Node`` pairing an accelerator from the ``Hardware``
    registry with a per-node accelerator count, an intra-node bandwidth, and
    a per-node NIC count.
    """

    # Eight H100 accelerators with one NIC per accelerator.
    # NOTE(review): 900 GB/s is presumably the accelerator interconnect
    # bandwidth -- confirm against the Node model's definition of intra_node_bw.
    DGX_H100 = Node(
        name="DGX H100",
        accelerator=Hardware.H100,
        accelerators_per_node=8,
        intra_node_bw=(900 * ureg.GB) / ureg.second,
        nics_per_node=8,
    )

    # Same shape as DGX_H100 but built on A100 with a lower intra-node bandwidth.
    DGX_A100 = Node(
        name="DGX A100",
        accelerator=Hardware.A100,
        accelerators_per_node=8,
        intra_node_bw=(600 * ureg.GB) / ureg.second,
        nics_per_node=8,
    )
class Fabrics:
    """Registry of reference network fabrics.

    Each attribute is a ``NetworkFabric`` with a short display name and a
    bandwidth taken from the shared constants module.
    """

    # Ethernet variants.
    Ethernet_10G = NetworkFabric(
        name="10GbE",
        bandwidth=NETWORK_10G_BW,
    )
    Ethernet_100G = NetworkFabric(
        name="100GbE",
        bandwidth=NETWORK_100G_BW,
    )

    # InfiniBand variants.
    InfiniBand_HDR = NetworkFabric(
        name="IB HDR",
        bandwidth=INFINIBAND_HDR_BW,
    )
    InfiniBand_NDR = NetworkFabric(
        name="IB NDR",
        bandwidth=INFINIBAND_NDR_BW,
    )
class Clusters:
    """Registry of reference clusters.

    Each attribute is a ``Fleet``: a node type from ``Nodes``, replicated
    ``count`` times, and connected by a fabric from ``Fabrics``.
    """

    # 32 DGX H100 nodes x 8 accelerators per node = 256 GPUs, on 100GbE.
    Research_256 = Fleet(
        name="Research Cluster (256 GPUs)",
        node=Nodes.DGX_H100,
        count=32,
        fabric=Fabrics.Ethernet_100G,
    )

    # 1024 DGX H100 nodes x 8 accelerators per node = 8192 GPUs, on IB NDR.
    Frontier_8K = Fleet(
        name="Frontier Cluster (8192 GPUs)",
        node=Nodes.DGX_H100,
        count=1024,
        fabric=Fabrics.InfiniBand_NDR,
    )
class Systems:
    """Single namespace exposing the system registries defined in this module.

    Gives callers one entry point, e.g. ``Systems.Clusters.Research_256`` or
    ``Systems.Fabrics.InfiniBand_NDR``.
    """

    # Each right-hand side resolves to the module-level class of the same
    # name: the class-local name is not bound until the assignment completes,
    # so the lookup falls through to the module globals.
    Tiers = Tiers
    Nodes = Nodes
    Clusters = Clusters
    Fabrics = Fabrics