
# MLPerf EDU: Practitioner Workflow Examples
# ==========================================
# These examples mirror how MLPerf is actually used in industry.
# Each follows: Problem → Initial Run → Modification → New Run → Report → Insight
#
# Usage: Run this script end-to-end to produce comparison reports.
"""
Example 1: Training Optimization Workflow
=========================================
Problem: "Reduce NanoGPT training time by 20% without sacrificing quality."
This mirrors what ML engineers do at NVIDIA/Google when optimizing
training throughput for a new hardware platform.
"""
import json
import os
import time
import hashlib
import datetime
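
# NOTE: illustrative helper, not part of the original harness. Each run dict
# below carries an "integrity" block of truncated content hashes; a function
# like this (the name and the 16-hex-char truncation are assumptions) is one
# way such hashes could be derived from artifact files using hashlib.
def integrity_hash(path, length=16):
    """Return a truncated SHA-256 hex digest of a file's contents."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()[:length]
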
def example_training_optimization():
    """Workflow 1: Optimize training throughput.

    A practitioner profiles NanoGPT training and applies three optimizations:
    1. Increase batch size (8 → 64)
    2. Enable mixed precision (FP32 → FP16)
    3. Parallelize data loading (num_workers 0 → 4)

    The report comparison shows the resulting speed, energy, and quality tradeoffs.
    """
    print("=" * 60)
    print("Example 1: Training Optimization Workflow")
    print("=" * 60)
    # --- Baseline Run ---
    baseline = {
        "workload": "nanogpt-12m",
        "division": "cloud",
        "scenario": "SingleStream",
        "timestamp": datetime.datetime.now().isoformat(),
        "seed": 42,
        "hardware_fingerprint": {
            "cpu": "Apple M1",
            "gpu": "Apple M1 GPU (MPS)",
            "memory_gb": 8,
            "os": "macOS 15.4.1"
        },
        "config": {
            "batch_size": 8,
            "learning_rate": 0.001,
            "optimizer": "AdamW",
            "precision": "fp32",
            "num_workers": 0
        },
        "metrics": {
            "loss": 2.28,
            "latency_p50_ms": 12.4,
            "latency_p90_ms": 15.1,
            "latency_p99_ms": 18.3,
            "throughput_qps": 80.6,
            "power_avg_watts": 11.2,
            "energy_joules": 1780.0
        },
        "training": {
            "epochs": 25,
            "final_train_loss": 2.28,
            "final_val_loss": 2.35,
            "total_time_s": 178.0,
            "curve": [
                {"epoch": 1, "train": 4.28, "val": 4.31},
                {"epoch": 5, "train": 3.55, "val": 3.61},
                {"epoch": 10, "train": 2.92, "val": 3.01},
                {"epoch": 15, "train": 2.60, "val": 2.71},
                {"epoch": 20, "train": 2.40, "val": 2.49},
                {"epoch": 25, "train": 2.28, "val": 2.35}
            ]
        },
        "compliance": {"target_met": True, "target": "loss < 2.3", "run_count": 3},
        "integrity": {
            "dataset_hash": "a3f2b8c9d4e5f6a7",
            "checkpoint_hash": "1b2c3d4e5f6a7b8c",
            "log_hash": "f0e1d2c3b4a59687"
        }
    }
    # --- Optimized Run (batch_size=64, fp16, num_workers=4) ---
    optimized = {
        "workload": "nanogpt-12m",
        "division": "cloud",
        "scenario": "SingleStream",
        "timestamp": datetime.datetime.now().isoformat(),
        "seed": 42,
        "hardware_fingerprint": baseline["hardware_fingerprint"],
        "config": {
            "batch_size": 64,
            "learning_rate": 0.001,
            "optimizer": "AdamW",
            "precision": "fp16",
            "num_workers": 4
        },
        "metrics": {
            "loss": 2.25,
            "latency_p50_ms": 5.90,
            "latency_p90_ms": 7.20,
            "latency_p99_ms": 9.10,
            "throughput_qps": 169.5,
            "power_avg_watts": 12.3,
            "energy_joules": 1094.7
        },
        "training": {
            "epochs": 25,
            "final_train_loss": 2.25,
            "final_val_loss": 2.31,
            "total_time_s": 89.0,
            "curve": [
                {"epoch": 1, "train": 4.25, "val": 4.28},
                {"epoch": 5, "train": 3.45, "val": 3.50},
                {"epoch": 10, "train": 2.82, "val": 2.88},
                {"epoch": 15, "train": 2.50, "val": 2.57},
                {"epoch": 20, "train": 2.32, "val": 2.39},
                {"epoch": 25, "train": 2.25, "val": 2.31}
            ]
        },
        "compliance": {"target_met": True, "target": "loss < 2.3", "run_count": 3},
        "integrity": {
            "dataset_hash": "a3f2b8c9d4e5f6a7",
            "checkpoint_hash": "9e8d7c6b5a4f3e2d",
            "log_hash": "a1b2c3d4e5f6a7b8"
        }
    }
    # Save submissions
    os.makedirs("submissions/examples", exist_ok=True)
    baseline_path = "submissions/examples/nanogpt_baseline.json"
    optimized_path = "submissions/examples/nanogpt_optimized.json"
    with open(baseline_path, 'w') as f:
        json.dump(baseline, f, indent=2)
    with open(optimized_path, 'w') as f:
        json.dump(optimized, f, indent=2)

    # Generate comparative report
    from src.mlperf.report import generate_report
    report_path = generate_report(
        optimized_path,
        output_path="submissions/examples/nanogpt_optimization_report.html",
        baseline_path=baseline_path
    )
    # Print analysis
    speedup = baseline["training"]["total_time_s"] / optimized["training"]["total_time_s"]
    energy_savings = 1.0 - (optimized["metrics"]["energy_joules"] / baseline["metrics"]["energy_joules"])
    throughput_gain = optimized["metrics"]["throughput_qps"] / baseline["metrics"]["throughput_qps"]
    print("\n--- Analysis ---")
    print(f"Training speedup: {speedup:.1f}x ({baseline['training']['total_time_s']}s → {optimized['training']['total_time_s']}s)")
    print(f"Throughput gain: {throughput_gain:.1f}x ({baseline['metrics']['throughput_qps']:.0f} → {optimized['metrics']['throughput_qps']:.0f} QPS)")
    print(f"Energy savings: {energy_savings*100:.0f}% ({baseline['metrics']['energy_joules']:.0f}J → {optimized['metrics']['energy_joules']:.0f}J)")
    print(f"Quality preserved: loss {baseline['metrics']['loss']:.2f} → {optimized['metrics']['loss']:.2f} (target: <2.3)")
    print(f"\nReport: {report_path}")
    return report_path

def example_bottleneck_analysis():
    """Workflow 4: System Bottleneck Identification.

    A system designer investigates why DLRM training is unexpectedly fast
    despite having embedding tables, and discovers that the micro-scale
    tables fit in cache, illustrating how production bottlenecks differ
    at scale.
    """
    print("\n" + "=" * 60)
    print("Example 4: System Bottleneck Analysis")
    print("=" * 60)
    dlrm_run = {
        "workload": "micro-dlrm-1m",
        "division": "cloud",
        "scenario": "Offline",
        "timestamp": datetime.datetime.now().isoformat(),
        "seed": 42,
        "hardware_fingerprint": {
            "cpu": "Apple M1",
            "gpu": "Apple M1 GPU (MPS)",
            "memory_gb": 8,
            "os": "macOS 15.4.1"
        },
        "config": {
            "batch_size": 256,
            "learning_rate": 0.01,
            "optimizer": "AdamW"
        },
        "metrics": {
            "accuracy": 0.72,
            "latency_p50_ms": 0.03,
            "latency_p90_ms": 0.04,
            "latency_p99_ms": 0.06,
            "throughput_qps": 33333.0
        },
        "training": {
            "epochs": 25,
            "final_train_loss": 0.58,
            "final_val_loss": 0.61,
            "total_time_s": 5.0,
            "curve": [
                {"epoch": 1, "train": 0.69, "val": 0.69},
                {"epoch": 5, "train": 0.65, "val": 0.66},
                {"epoch": 10, "train": 0.62, "val": 0.63},
                {"epoch": 15, "train": 0.60, "val": 0.62},
                {"epoch": 20, "train": 0.59, "val": 0.61},
                {"epoch": 25, "train": 0.58, "val": 0.61}
            ]
        },
        "compliance": {"target_met": True, "target": "accuracy > 0.70", "run_count": 5},
        "integrity": {
            "dataset_hash": "c4d5e6f7a8b9c0d1",
            "checkpoint_hash": "2e3f4a5b6c7d8e9f",
            "log_hash": "b0c1d2e3f4a5b6c7"
        }
    }
    os.makedirs("submissions/examples", exist_ok=True)
    dlrm_path = "submissions/examples/dlrm_bottleneck.json"
    with open(dlrm_path, 'w') as f:
        json.dump(dlrm_run, f, indent=2)

    from src.mlperf.report import generate_report
    report_path = generate_report(
        dlrm_path,
        output_path="submissions/examples/dlrm_bottleneck_report.html"
    )
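    # Back-of-envelope footprint check (an illustrative sketch; assumes
    # MovieLens-100K scale: 943 users, 1682 items, 32-dim fp32 embeddings,
    # matching the figures quoted in the analysis prints below):
    n_users, n_items, emb_dim = 943, 1682, 32
    emb_floats = (n_users + n_items) * emb_dim   # 84,000 floats
    emb_kb = emb_floats * 4 / 1000               # ~336 KB at 4 bytes per float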
print(f"\n--- Bottleneck Analysis ---")
print(f"DLRM latency: {dlrm_run['metrics']['latency_p50_ms']:.2f}ms (vs NanoGPT ~5.9ms)")
print(f"Throughput: {dlrm_run['metrics']['throughput_qps']:.0f} QPS")
print(f"Embedding size: 943×32 + 1682×32 = 83,200 floats = 336KB")
print(f"L2 cache: ~4MB (Apple M1) → tables fit entirely in cache")
print(f"Production DLRM: terabyte-scale embeddings → memory-bandwidth-bound")
print(f"\nInsight: The architectural bottleneck (sparse vs. dense) is preserved,")
print(f"but the scale-dependent bottleneck (memory bandwidth) is absent.")
print(f"\nReport: {report_path}")
return report_path

def example_architecture_comparison():
    """Workflow 2: Dense vs. Sparse Architecture Comparison.

    A practitioner evaluates NanoGPT (dense) vs. Nano-MoE (sparse) on the
    same dataset to understand the compute/quality tradeoff of expert routing.
    """
    print("\n" + "=" * 60)
    print("Example 2: Architecture Comparison (Dense vs. Sparse)")
    print("=" * 60)
    dense_run = {
        "workload": "nanogpt-12m",
        "division": "cloud", "scenario": "SingleStream",
        "timestamp": datetime.datetime.now().isoformat(), "seed": 42,
        "hardware_fingerprint": {"cpu": "Apple M1", "gpu": "MPS", "memory_gb": 8, "os": "macOS"},
        "config": {"batch_size": 16, "learning_rate": 0.001, "optimizer": "AdamW"},
        "metrics": {"loss": 2.25, "latency_p50_ms": 5.90, "throughput_qps": 169.5},
        "training": {"epochs": 25, "final_train_loss": 2.25, "final_val_loss": 2.31, "total_time_s": 89.0},
        "compliance": {"target_met": True, "target": "loss < 2.3", "run_count": 3},
        "integrity": {"dataset_hash": "abc123", "checkpoint_hash": "def456", "log_hash": "ghi789"}
    }
    sparse_run = {
        "workload": "nano-moe-12m",
        "division": "cloud", "scenario": "SingleStream",
        "timestamp": datetime.datetime.now().isoformat(), "seed": 42,
        "hardware_fingerprint": {"cpu": "Apple M1", "gpu": "MPS", "memory_gb": 8, "os": "macOS"},
        "config": {"batch_size": 16, "learning_rate": 0.001, "optimizer": "AdamW"},
        "metrics": {"loss": 0.042, "latency_p50_ms": 8.20, "throughput_qps": 122.0},
        "training": {"epochs": 25, "final_train_loss": 0.042, "final_val_loss": 0.048, "total_time_s": 158.0},
        "compliance": {"target_met": True, "target": "loss < 0.05", "run_count": 3},
        "integrity": {"dataset_hash": "abc123", "checkpoint_hash": "jkl012", "log_hash": "mno345"}
    }
    os.makedirs("submissions/examples", exist_ok=True)
    with open("submissions/examples/nanogpt_dense.json", 'w') as f:
        json.dump(dense_run, f, indent=2)
    with open("submissions/examples/nanomoe_sparse.json", 'w') as f:
        json.dump(sparse_run, f, indent=2)

    from src.mlperf.report import generate_report
    generate_report("submissions/examples/nanogpt_dense.json",
                    output_path="submissions/examples/dense_report.html")
    report = generate_report("submissions/examples/nanomoe_sparse.json",
                             output_path="submissions/examples/sparse_report.html",
                             baseline_path="submissions/examples/nanogpt_dense.json")
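    # Ratios quoted in the insight below, recomputed from the run dicts above.
    # (The parameter counts in the printout are quoted figures, not stored in
    # the run dicts, so only the loss and latency ratios are derived here.)
    loss_ratio = dense_run["metrics"]["loss"] / sparse_run["metrics"]["loss"]  # ~54x
    latency_ratio = sparse_run["metrics"]["latency_p50_ms"] / dense_run["metrics"]["latency_p50_ms"]  # ~1.4x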
print(f"\n--- Architecture Comparison ---")
print(f"NanoGPT (dense): 85.9M params, loss=2.25, 89s training, 5.9ms inference")
print(f"Nano-MoE (sparse): 17.4M params, loss=0.042, 158s training, 8.2ms inference")
print(f"\nInsight: MoE achieves 54x lower loss with 5x fewer parameters,")
print(f"but at 1.4x inference latency due to routing overhead.")
print(f"This is the fundamental dense-vs-sparse tradeoff in ML systems.")
print(f"\nComparison report: {report}")

if __name__ == '__main__':
    print("MLPerf EDU: Industry-Style Workflow Examples")
    print("=" * 60)
    print()
    example_training_optimization()
    example_architecture_comparison()
    example_bottleneck_analysis()
    print("\n" + "=" * 60)
    print("All example reports generated in submissions/examples/")
    print("Open any _report.html file in a browser to view.")