Files
Vijay Janapa Reddi a9878ad6bd feat: import mlperf-edu pedagogical benchmark suite
Snapshot of the standalone /Users/VJ/GitHub/mlperf-edu/ repo as of
2026-04-16, brought into MLSysBook as a parked feature branch for
backup and iteration. Not for merge to dev.

Contents (88 files, ~2.3 MB):
- 16 reference workloads (cloud / edge / tiny / agent divisions)
- LoadGen proxy harness + SUT plugin protocol
- Compliance checker, autograder, hardware fingerprint
- Paper draft (paper.tex) with TikZ/SVG figure sources
- Three lab examples + practitioner workflow configs
- Workload + dataset YAML registries (single source of truth)

Excluded (per mlperf-edu/.gitignore + size constraints):
- Datasets (6.6 GB), checkpoints (260 MB), gpt2 weights (523 MB)
- Generated PDFs, .venv, build artifacts
2026-04-16 14:15:05 -04:00

134 lines
4.4 KiB
Python

"""
MLPerf EDU: Nano-MoE (Cloud Division)
A sparse Mixture-of-Experts language model with 8 experts and top-2
routing, mapping the MLPerf Training Switch Transformer benchmark
to laptop scale.
Architecture:
Token embedding + positional embedding
→ N layers of [Self-Attention + Sparse MoE FFN]
→ Language model head
The MoE layer routes each token to 2 of 8 experts, demonstrating:
- Sparse computation: only 25% of expert parameters activate per token
- Total vs. active parameter distinction (17.4M total, ~5M active)
- Routing overhead and load balancing challenges
Quality Target: Cross-entropy loss < 0.05 on TinyShakespeare
Provenance: Shazeer et al. 2017, "Outrageously Large Neural Networks"
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
class Expert(nn.Module):
    """One MoE expert: a bias-free two-layer feed-forward network.

    The input is projected from ``d_model`` up to ``d_hidden``, passed
    through SiLU, and projected back down to ``d_model``.
    """

    def __init__(self, d_model: int, d_hidden: int):
        super().__init__()
        # Up-projection followed by down-projection; neither carries a bias.
        self.w1 = nn.Linear(d_model, d_hidden, bias=False)
        self.w2 = nn.Linear(d_hidden, d_model, bias=False)

    def forward(self, x):
        """Apply ``w2(silu(w1(x)))`` over the last dimension of ``x``."""
        hidden = self.w1(x)
        activated = F.silu(hidden)
        return self.w2(activated)
class SparseMoERouter(nn.Module):
    """
    Top-K sparse Mixture-of-Experts layer.

    Routes each token to ``top_k`` experts based on a learned gating
    function and returns the weighted sum of the selected experts' outputs.

    Students can measure:
    - Load balance across experts (are all experts used equally?)
    - Routing overhead (gate computation + gather/scatter)
    - Memory: all experts are in memory, but only top_k compute per token

    Args:
        d_model: Size of the last (feature) dimension of the input.
        num_experts: Number of expert FFNs held by the layer.
        top_k: Number of experts each token is routed to.
        ffn_mult: Hidden width of each expert as a multiple of ``d_model``.

    Raises:
        ValueError: If ``top_k`` is not in ``[1, num_experts]``.
    """

    def __init__(self, d_model: int, num_experts: int = 8, top_k: int = 2,
                 ffn_mult: int = 4):
        super().__init__()
        # Fail at construction time instead of with an opaque torch.topk
        # error on the first forward pass.
        if not 1 <= top_k <= num_experts:
            raise ValueError(
                f"top_k must be in [1, num_experts={num_experts}], got {top_k}"
            )
        self.num_experts = num_experts
        self.top_k = top_k
        self.gate = nn.Linear(d_model, num_experts, bias=False)
        self.experts = nn.ModuleList([
            Expert(d_model, d_model * ffn_mult) for _ in range(num_experts)
        ])

    def forward(self, x):
        """Route every token in ``x`` through its top-k experts.

        Args:
            x: Tensor of shape ``(..., d_model)``; all leading dimensions
                are flattened into a single token axis for routing.

        Returns:
            Tensor with the same shape as ``x``.
        """
        orig_shape = x.shape
        d_model = orig_shape[-1]
        # reshape (not view) so non-contiguous inputs, e.g. transposed
        # tensors, are accepted.
        x_flat = x.reshape(-1, d_model)

        # Compute routing probabilities over all experts.
        router_logits = self.gate(x_flat)
        routing_probs = F.softmax(router_logits, dim=-1)

        # Keep the top-k experts per token and renormalise their weights so
        # they sum to 1 for each token.
        routing_weights, selected_experts = torch.topk(
            routing_probs, self.top_k, dim=-1
        )
        routing_weights = routing_weights / routing_weights.sum(
            dim=-1, keepdim=True
        )

        # Dispatch tokens to experts and accumulate the weighted results.
        final_output = torch.zeros_like(x_flat)
        for i, expert in enumerate(self.experts):
            # token_idx: rows routed to expert i; slot_idx: which of the
            # token's top-k slots picked it (selects the matching weight).
            token_idx, slot_idx = torch.where(selected_experts == i)
            if token_idx.numel() == 0:
                continue
            expert_out = expert(x_flat[token_idx])
            weighted = expert_out * routing_weights[token_idx, slot_idx].unsqueeze(-1)
            # Cast defensively in case the expert output dtype differs from
            # the accumulator's dtype.
            final_output.index_add_(0, token_idx, weighted.to(final_output.dtype))
        return final_output.reshape(orig_shape)
class NanoMoEWhiteBox(nn.Module):
    """
    Sparse MoE language model (about 17.4M parameters at the default sizes).

    Replaces the standard FFN in each transformer layer with a
    SparseMoERouter, demonstrating sparse conditional computation.

    Args:
        vocab_size: Number of token ids in the embedding and output head.
        d_model: Width of the residual stream.
        n_heads: Number of attention heads per layer.
        n_layers: Number of [self-attention + MoE] layers.
        max_seq_len: Size of the learned position table; the longest
            sequence ``forward`` accepts.
        num_experts: Experts per MoE layer.
        top_k: Experts each token is routed to.
        causal: When True (default), position ``t`` may only attend to
            positions ``<= t``, so next-token targets cannot be read from
            later positions. Pass False to get unmasked (bidirectional)
            attention.
    """

    def __init__(self, vocab_size=50257, d_model=128, n_heads=4, n_layers=4,
                 max_seq_len=256, num_experts=8, top_k=2, causal=True):
        super().__init__()
        self.causal = causal
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(max_seq_len, d_model)
        self.layers = nn.ModuleList([
            nn.ModuleDict(dict(
                ln_1=nn.LayerNorm(d_model),
                attn=nn.MultiheadAttention(d_model, n_heads, batch_first=True),
                ln_2=nn.LayerNorm(d_model),
                moe=SparseMoERouter(d_model, num_experts=num_experts, top_k=top_k),
            ))
            for _ in range(n_layers)
        ])
        self.ln_f = nn.LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the LM loss.

        Args:
            idx: Long tensor of token ids with shape ``(B, T)``.
            targets: Optional long tensor of target ids with ``B * T``
                elements; flattened and compared against the flattened
                logits.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape
            ``(B, T, vocab_size)`` and ``loss`` is the mean cross-entropy,
            or ``None`` when ``targets`` is not provided.

        Raises:
            ValueError: If ``T`` exceeds the size of the position table.
        """
        _, T = idx.size()
        max_len = self.pos_emb.num_embeddings
        if T > max_len:
            # Explicit check: otherwise the position lookup fails with an
            # opaque out-of-range embedding index error.
            raise ValueError(
                f"sequence length {T} exceeds maximum supported length {max_len}"
            )
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        x = self.token_emb(idx) + self.pos_emb(pos)

        # Boolean mask for nn.MultiheadAttention: True marks key positions a
        # query is NOT allowed to attend to (everything strictly in the future).
        attn_mask = None
        if self.causal:
            attn_mask = torch.triu(
                torch.ones(T, T, dtype=torch.bool, device=idx.device),
                diagonal=1,
            )

        for block in self.layers:
            # Self-attention with residual; normalise once and reuse the
            # result as query, key and value.
            h = block['ln_1'](x)
            attn_out, _ = block['attn'](
                h, h, h,
                attn_mask=attn_mask,
                need_weights=False
            )
            x = x + attn_out
            # Sparse MoE FFN with residual
            x = x + block['moe'](block['ln_2'](x))

        logits = self.lm_head(self.ln_f(x))
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss