mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-03-09 07:15:51 -05:00
Complete MLSYSIM v0.1.0 implementation with: - Documentation website (Quarto): landing page with animated hero and capability carousel, 4 tutorials (hello world, LLM serving, distributed training, sustainability), hardware/model/fleet/infra catalogs, solver guide, whitepaper, math foundations, glossary, and full quartodoc API reference - Typed registry system: Hardware (18 devices across 5 tiers), Models (15 workloads), Systems (fleets, clusters, fabrics), Infrastructure (grid profiles, rack configs, datacenters) - Core types: Pint-backed Quantity, Metadata provenance tracking, custom exception hierarchy (OOMError, SLAViolation) - SimulationConfig with YAML/JSON loading and pre-validation - Scenario system tying workloads to systems with SLA constraints - Multi-level evaluation scorecard (feasibility, performance, macro) - Examples, tests, and Jetson Orin NX spec fix (100 → 25 TFLOP/s) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
42 lines
1.4 KiB
Python
import pytest
|
|
import mlsysim
|
|
from mlsysim.core.constants import ureg
|
|
|
|
def test_mlperf_resnet_a100():
    """Validate the solver against a real-world measurement.

    Empirical Anchor: ResNet-50 on NVIDIA A100 (SXM4).
    Reference: MLPerf Inference v4.0, NVIDIA Submission.
    Target: ~37,000 samples/second (Offline scenario).
    """
    # Workload and device under test, pulled from the typed registries.
    workload = mlsysim.Models.Vision.ResNet50
    device = mlsysim.Hardware.A100

    # The efficiency factor (eta) folds in real-world overheads seen in
    # MLPerf submissions (kernel launch, data loading, etc.); 0.49 is a
    # typical MFU/HFU for ResNet on A100 at scale.
    result = mlsysim.Engine.solve(workload, device, batch_size=2048, efficiency=0.49)

    # Convert the Pint quantity to a plain float in samples/second.
    predicted_throughput = result.throughput.m_as("1/second")

    # The MLPerf-reported figure is ~37,000; accept a modest band around it.
    assert 35000 <= predicted_throughput <= 40000
    print(f"Predicted: {predicted_throughput:.1f} samples/s | MLPerf Target: ~37,000")
|
def test_llama_inference_h100():
    """Validate the serving solver against published LLM latency numbers.

    Empirical Anchor: Llama-2-70B on NVIDIA H100.
    Reference: NVIDIA/vLLM benchmarks.
    Target ITL: ~40-50ms (Batch 1, FP16).
    """
    # Workload and device under test, pulled from the typed registries.
    workload = mlsysim.Models.Language.Llama2_70B
    device = mlsysim.Hardware.H100

    # Batch-1 decode at full efficiency: inter-token latency should be
    # memory-bandwidth bound.
    solver = mlsysim.ServingSolver()
    outcome = solver.solve(workload, device, seq_len=2048, batch_size=1, efficiency=1.0)

    # Convert the Pint quantity to plain milliseconds.
    itl = outcome['itl'].m_as("ms")

    # Analytic bound: ITL = ModelSize / BW = 140GB / 3.35TB/s = ~41.8ms,
    # so assert a tight band around that value.
    assert 40 <= itl <= 45
    print(f"Predicted ITL: {itl:.2f} ms | vLLM Target: ~42ms")