mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-07 18:18:42 -05:00
Snapshot of the standalone /Users/VJ/GitHub/mlperf-edu/ repo as of 2026-04-16, brought into MLSysBook as a parked feature branch for backup and iteration. Not for merge to dev. Contents (88 files, ~2.3 MB): - 16 reference workloads (cloud / edge / tiny / agent divisions) - LoadGen proxy harness + SUT plugin protocol - Compliance checker, autograder, hardware fingerprint - Paper draft (paper.tex) with TikZ/SVG figure sources - Three lab examples + practitioner workflow configs - Workload + dataset YAML registries (single source of truth) Excluded (per mlperf-edu/.gitignore + size constraints): - Datasets (6.6 GB), checkpoints (260 MB), gpt2 weights (523 MB) - Generated PDFs, .venv, build artifacts
319 lines
12 KiB
Python
319 lines
12 KiB
Python
# MLPerf EDU: Practitioner Workflow Examples
# ==========================================
# These examples mirror how MLPerf is actually used in industry.
# Each follows: Problem → Initial Run → Modification → New Run → Report → Insight
#
# Usage: Run these scripts end-to-end to produce comparison reports.
|
||
|
||
"""
Example 1: Training Optimization Workflow
=========================================
Problem: "Reduce NanoGPT training time by 20% without sacrificing quality."

This mirrors what ML engineers do at NVIDIA/Google when optimizing
training throughput for a new hardware platform.
"""
|
||
|
||
import json
|
||
import os
|
||
import time
|
||
import hashlib
|
||
import datetime
|
||
|
||
|
||
def example_training_optimization():
    """Workflow 1: speed up NanoGPT training and compare against a baseline.

    Two hard-coded submission records are written to ``submissions/examples``:
    a baseline run (batch size 8, FP32, no loader workers) and an optimized
    run (batch size 64, FP16, four loader workers).  A comparative HTML report
    is generated for the optimized run against the baseline, and a short
    speedup / throughput / energy / quality summary is printed.

    Returns:
        The value returned by ``generate_report`` for the comparison report
        (it is printed as the report location).
    """
    banner = "=" * 60
    print(banner)
    print("Example 1: Training Optimization Workflow")
    print(banner)

    # Both runs are recorded against the same machine description; the two
    # records share this one dict object.
    machine = {
        "cpu": "Apple M1",
        "gpu": "Apple M1 GPU (MPS)",
        "memory_gb": 8,
        "os": "macOS 15.4.1",
    }

    def loss_curve(points):
        # Expand (epoch, train, val) triples into per-epoch curve records.
        return [{"epoch": ep, "train": tr, "val": va} for ep, tr, va in points]

    def submission(config, metrics, training, checkpoint_hash, log_hash):
        # Wrap the run-specific sections in the fields both runs have in
        # common.  Key order here is the key order of the JSON on disk.
        return {
            "workload": "nanogpt-12m",
            "division": "cloud",
            "scenario": "SingleStream",
            "timestamp": datetime.datetime.now().isoformat(),
            "seed": 42,
            "hardware_fingerprint": machine,
            "config": config,
            "metrics": metrics,
            "training": training,
            "compliance": {"target_met": True, "target": "loss < 2.3", "run_count": 3},
            "integrity": {
                "dataset_hash": "a3f2b8c9d4e5f6a7",
                "checkpoint_hash": checkpoint_hash,
                "log_hash": log_hash,
            },
        }

    # --- Baseline Run ---
    baseline = submission(
        config={
            "batch_size": 8,
            "learning_rate": 0.001,
            "optimizer": "AdamW",
            "precision": "fp32",
            "num_workers": 0,
        },
        metrics={
            "loss": 2.28,
            "latency_p50_ms": 12.4,
            "latency_p90_ms": 15.1,
            "latency_p99_ms": 18.3,
            "throughput_qps": 80.6,
            "power_avg_watts": 11.2,
            "energy_joules": 1780.0,
        },
        training={
            "epochs": 25,
            "final_train_loss": 2.28,
            "final_val_loss": 2.35,
            "total_time_s": 178.0,
            "curve": loss_curve([
                (1, 4.28, 4.31),
                (5, 3.55, 3.61),
                (10, 2.92, 3.01),
                (15, 2.60, 2.71),
                (20, 2.40, 2.49),
                (25, 2.28, 2.35),
            ]),
        },
        checkpoint_hash="1b2c3d4e5f6a7b8c",
        log_hash="f0e1d2c3b4a59687",
    )

    # --- Optimized Run (batch_size=64, fp16, num_workers=4) ---
    optimized = submission(
        config={
            "batch_size": 64,
            "learning_rate": 0.001,
            "optimizer": "AdamW",
            "precision": "fp16",
            "num_workers": 4,
        },
        metrics={
            "loss": 2.25,
            "latency_p50_ms": 5.90,
            "latency_p90_ms": 7.20,
            "latency_p99_ms": 9.10,
            "throughput_qps": 169.5,
            "power_avg_watts": 12.3,
            "energy_joules": 1094.7,
        },
        training={
            "epochs": 25,
            "final_train_loss": 2.25,
            "final_val_loss": 2.31,
            "total_time_s": 89.0,
            "curve": loss_curve([
                (1, 4.25, 4.28),
                (5, 3.45, 3.50),
                (10, 2.82, 2.88),
                (15, 2.50, 2.57),
                (20, 2.32, 2.39),
                (25, 2.25, 2.31),
            ]),
        },
        checkpoint_hash="9e8d7c6b5a4f3e2d",
        log_hash="a1b2c3d4e5f6a7b8",
    )

    # Save submissions
    out_dir = "submissions/examples"
    os.makedirs(out_dir, exist_ok=True)

    baseline_path = f"{out_dir}/nanogpt_baseline.json"
    optimized_path = f"{out_dir}/nanogpt_optimized.json"

    for target, record in ((baseline_path, baseline), (optimized_path, optimized)):
        with open(target, 'w') as handle:
            json.dump(record, handle, indent=2)

    # Generate comparative report (imported here, after the records are on disk)
    from src.mlperf.report import generate_report

    report_path = generate_report(
        optimized_path,
        output_path=f"{out_dir}/nanogpt_optimization_report.html",
        baseline_path=baseline_path,
    )

    # Print analysis
    base_time = baseline["training"]["total_time_s"]
    opt_time = optimized["training"]["total_time_s"]
    base_metrics = baseline["metrics"]
    opt_metrics = optimized["metrics"]
    base_qps, opt_qps = base_metrics["throughput_qps"], opt_metrics["throughput_qps"]
    base_energy, opt_energy = base_metrics["energy_joules"], opt_metrics["energy_joules"]
    base_loss, opt_loss = base_metrics["loss"], opt_metrics["loss"]

    speedup = base_time / opt_time
    energy_savings = 1.0 - (opt_energy / base_energy)
    throughput_gain = opt_qps / base_qps

    print("\n--- Analysis ---")
    print(f"Training speedup: {speedup:.1f}x ({base_time}s → {opt_time}s)")
    print(f"Throughput gain: {throughput_gain:.1f}x ({base_qps:.0f} → {opt_qps:.0f} QPS)")
    print(f"Energy savings: {energy_savings*100:.0f}% ({base_energy:.0f}J → {opt_energy:.0f}J)")
    print(f"Quality preserved: loss {base_loss:.2f} → {opt_loss:.2f} (target: <2.3)")
    print(f"\nReport: {report_path}")

    return report_path
|
||
|
||
|
||
def example_bottleneck_analysis():
    """Workflow 4: System Bottleneck Identification.

    A system designer investigates why DLRM training is unexpectedly fast
    despite having embedding tables. Discovers the micro-scale table fits
    in cache, illustrating how production bottlenecks differ at scale.

    A hard-coded micro-DLRM submission record is written to
    ``submissions/examples/dlrm_bottleneck.json``, an HTML report is
    generated from it, and the bottleneck analysis is printed.

    Returns:
        The value returned by ``generate_report`` for the DLRM report
        (it is printed as the report location).
    """
    print("\n" + "=" * 60)
    print("Example 4: System Bottleneck Analysis")
    print("=" * 60)

    dlrm_run = {
        "workload": "micro-dlrm-1m",
        "division": "cloud",
        "scenario": "Offline",
        "timestamp": datetime.datetime.now().isoformat(),
        "seed": 42,
        "hardware_fingerprint": {
            "cpu": "Apple M1",
            "gpu": "Apple M1 GPU (MPS)",
            "memory_gb": 8,
            "os": "macOS 15.4.1"
        },
        "config": {
            "batch_size": 256,
            "learning_rate": 0.01,
            "optimizer": "AdamW"
        },
        "metrics": {
            "accuracy": 0.72,
            "latency_p50_ms": 0.03,
            "latency_p90_ms": 0.04,
            "latency_p99_ms": 0.06,
            "throughput_qps": 33333.0
        },
        "training": {
            "epochs": 25,
            "final_train_loss": 0.58,
            "final_val_loss": 0.61,
            "total_time_s": 5.0,
            "curve": [
                {"epoch": 1, "train": 0.69, "val": 0.69},
                {"epoch": 5, "train": 0.65, "val": 0.66},
                {"epoch": 10, "train": 0.62, "val": 0.63},
                {"epoch": 15, "train": 0.60, "val": 0.62},
                {"epoch": 20, "train": 0.59, "val": 0.61},
                {"epoch": 25, "train": 0.58, "val": 0.61}
            ]
        },
        "compliance": {"target_met": True, "target": "accuracy > 0.70", "run_count": 5},
        "integrity": {
            "dataset_hash": "c4d5e6f7a8b9c0d1",
            "checkpoint_hash": "2e3f4a5b6c7d8e9f",
            "log_hash": "b0c1d2e3f4a5b6c7"
        }
    }

    os.makedirs("submissions/examples", exist_ok=True)
    dlrm_path = "submissions/examples/dlrm_bottleneck.json"
    with open(dlrm_path, 'w') as f:
        json.dump(dlrm_run, f, indent=2)

    # Imported here, after the record is on disk.
    from src.mlperf.report import generate_report
    report_path = generate_report(
        dlrm_path,
        output_path="submissions/examples/dlrm_bottleneck_report.html"
    )

    # Derive the embedding footprint from the table shapes instead of
    # hard-coding it: the previous message claimed 83,200 floats, but
    # (943 + 1682) * 32 = 84,000, which is also what the 336KB figure needs.
    embedding_rows = (943, 1682)
    embedding_dim = 32
    bytes_per_float = 4  # assumes 4-byte floats, consistent with the 336KB figure
    table_terms = " + ".join(f"{rows}×{embedding_dim}" for rows in embedding_rows)
    embedding_floats = sum(embedding_rows) * embedding_dim
    embedding_kb = embedding_floats * bytes_per_float // 1000

    print("\n--- Bottleneck Analysis ---")
    print(f"DLRM latency: {dlrm_run['metrics']['latency_p50_ms']:.2f}ms (vs NanoGPT ~5.9ms)")
    print(f"Throughput: {dlrm_run['metrics']['throughput_qps']:.0f} QPS")
    print(f"Embedding size: {table_terms} = {embedding_floats:,} floats = {embedding_kb}KB")
    print("L2 cache: ~4MB (Apple M1) → tables fit entirely in cache")
    print("Production DLRM: terabyte-scale embeddings → memory-bandwidth-bound")
    print("\nInsight: The architectural bottleneck (sparse vs. dense) is preserved,")
    print("but the scale-dependent bottleneck (memory bandwidth) is absent.")
    print(f"\nReport: {report_path}")

    return report_path
|
||
|
||
|
||
def example_architecture_comparison():
    """Workflow 2: Dense vs. Sparse Architecture Comparison.

    A practitioner evaluates NanoGPT (dense) vs Nano-MoE (sparse) on the
    same dataset to understand the compute/quality tradeoff of expert routing.

    Two hard-coded submission records are written to ``submissions/examples``.
    A standalone report is generated for the dense run, then a comparison
    report for the sparse run with the dense run as its baseline, and a
    summary derived from the two records is printed.

    Returns:
        The value returned by ``generate_report`` for the comparison report,
        matching what the other workflow examples in this file return.
    """
    print("\n" + "=" * 60)
    print("Example 2: Architecture Comparison (Dense vs. Sparse)")
    print("=" * 60)

    dense_run = {
        "workload": "nanogpt-12m",
        "division": "cloud", "scenario": "SingleStream",
        "timestamp": datetime.datetime.now().isoformat(), "seed": 42,
        "hardware_fingerprint": {"cpu": "Apple M1", "gpu": "MPS", "memory_gb": 8, "os": "macOS"},
        "config": {"batch_size": 16, "learning_rate": 0.001, "optimizer": "AdamW"},
        "metrics": {"loss": 2.25, "latency_p50_ms": 5.90, "throughput_qps": 169.5},
        "training": {"epochs": 25, "final_train_loss": 2.25, "final_val_loss": 2.31, "total_time_s": 89.0},
        "compliance": {"target_met": True, "target": "loss < 2.3", "run_count": 3},
        "integrity": {"dataset_hash": "abc123", "checkpoint_hash": "def456", "log_hash": "ghi789"}
    }

    sparse_run = {
        "workload": "nano-moe-12m",
        "division": "cloud", "scenario": "SingleStream",
        "timestamp": datetime.datetime.now().isoformat(), "seed": 42,
        "hardware_fingerprint": {"cpu": "Apple M1", "gpu": "MPS", "memory_gb": 8, "os": "macOS"},
        "config": {"batch_size": 16, "learning_rate": 0.001, "optimizer": "AdamW"},
        "metrics": {"loss": 0.042, "latency_p50_ms": 8.20, "throughput_qps": 122.0},
        "training": {"epochs": 25, "final_train_loss": 0.042, "final_val_loss": 0.048, "total_time_s": 158.0},
        "compliance": {"target_met": True, "target": "loss < 0.05", "run_count": 3},
        "integrity": {"dataset_hash": "abc123", "checkpoint_hash": "jkl012", "log_hash": "mno345"}
    }

    os.makedirs("submissions/examples", exist_ok=True)
    dense_path = "submissions/examples/nanogpt_dense.json"
    sparse_path = "submissions/examples/nanomoe_sparse.json"
    with open(dense_path, 'w') as f:
        json.dump(dense_run, f, indent=2)
    with open(sparse_path, 'w') as f:
        json.dump(sparse_run, f, indent=2)

    # Imported here, after the records are on disk.
    from src.mlperf.report import generate_report
    generate_report(dense_path,
                    output_path="submissions/examples/dense_report.html")
    report = generate_report(sparse_path,
                             output_path="submissions/examples/sparse_report.html",
                             baseline_path=dense_path)

    # Derive the summary from the records so the printed numbers cannot drift
    # from the JSON that was just written.  Parameter counts are not part of
    # the records, so those (and the "5x fewer" claim) stay literal.
    dense_metrics, sparse_metrics = dense_run["metrics"], sparse_run["metrics"]
    dense_time = dense_run["training"]["total_time_s"]
    sparse_time = sparse_run["training"]["total_time_s"]
    loss_ratio = dense_metrics["loss"] / sparse_metrics["loss"]
    latency_ratio = sparse_metrics["latency_p50_ms"] / dense_metrics["latency_p50_ms"]

    print("\n--- Architecture Comparison ---")
    print(f"NanoGPT (dense): 85.9M params, loss={dense_metrics['loss']:g}, "
          f"{dense_time:g}s training, {dense_metrics['latency_p50_ms']:g}ms inference")
    print(f"Nano-MoE (sparse): 17.4M params, loss={sparse_metrics['loss']:g}, "
          f"{sparse_time:g}s training, {sparse_metrics['latency_p50_ms']:g}ms inference")
    print(f"\nInsight: MoE achieves {loss_ratio:.0f}x lower loss with 5x fewer parameters,")
    print(f"but at {latency_ratio:.1f}x inference latency due to routing overhead.")
    print("This is the fundamental dense-vs-sparse tradeoff in ML systems.")
    print(f"\nComparison report: {report}")

    return report
|
||
|
||
|
||
if __name__ == '__main__':
    # Script entry point: print a banner, run every workflow example, then
    # tell the user where the generated reports are.
    divider = "=" * 60

    print("MLPerf EDU: Industry-Style Workflow Examples")
    print(divider)
    print()

    # Execution order: training optimization, architecture comparison,
    # bottleneck analysis.
    for workflow in (
        example_training_optimization,
        example_architecture_comparison,
        example_bottleneck_analysis,
    ):
        workflow()

    print("\n" + divider)
    print("All example reports generated in submissions/examples/")
    print("Open any _report.html file in a browser to view.")
|