mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-22 05:53:13 -05:00
Snapshot of the standalone /Users/VJ/GitHub/mlperf-edu/ repo as of 2026-04-16, brought into MLSysBook as a parked feature branch for backup and iteration. Not for merge to dev. Contents (88 files, ~2.3 MB): - 16 reference workloads (cloud / edge / tiny / agent divisions) - LoadGen proxy harness + SUT plugin protocol - Compliance checker, autograder, hardware fingerprint - Paper draft (paper.tex) with TikZ/SVG figure sources - Three lab examples + practitioner workflow configs - Workload + dataset YAML registries (single source of truth) Excluded (per mlperf-edu/.gitignore + size constraints): - Datasets (6.6 GB), checkpoints (260 MB), gpt2 weights (523 MB) - Generated PDFs, .venv, build artifacts
134 lines
4.4 KiB
Python
134 lines
4.4 KiB
Python
"""
|
|
MLPerf EDU: Nano-MoE (Cloud Division)

A sparse Mixture-of-Experts language model with 8 experts and top-2
routing, mapping the MLPerf Training Switch Transformer benchmark
to laptop scale.

Architecture:
    Token embedding + positional embedding
    → N layers of [Self-Attention + Sparse MoE FFN]
    → Language model head

The MoE layer routes each token to 2 of 8 experts, demonstrating:
    - Sparse computation: only 25% of expert parameters activate per token
    - Total vs. active parameter distinction (17.4M total, ~5M active)
    - Routing overhead and load balancing challenges

Quality Target: Cross-entropy loss < 0.05 on TinyShakespeare

Provenance: Shazeer et al. 2017, "Outrageously Large Neural Networks"
|
|
"""
|
|
|
|
import torch
|
|
import torch.nn as nn
|
|
import torch.nn.functional as F
|
|
|
|
|
|
class Expert(nn.Module):
    """Single expert: a standard 2-layer FFN with SiLU activation.

    Computes ``w2(silu(w1(x)))`` using two bias-free linear layers, mapping
    the last dimension ``d_model -> d_hidden -> d_model``.

    Args:
        d_model: Size of the input and output feature dimension.
        d_hidden: Size of the intermediate feature dimension.
    """

    def __init__(self, d_model: int, d_hidden: int):
        super().__init__()
        # The attribute names w1/w2 determine the state_dict keys; keep them.
        self.w1 = nn.Linear(d_model, d_hidden, bias=False)  # up-projection
        self.w2 = nn.Linear(d_hidden, d_model, bias=False)  # down-projection

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the feed-forward network to the last dimension of ``x``."""
        hidden = F.silu(self.w1(x))
        return self.w2(hidden)
|
|
|
|
|
|
class SparseMoERouter(nn.Module):
    """
    Top-K sparse Mixture-of-Experts layer.

    Routes each token to top_k experts based on a learned gating function.
    Students can measure:
    - Load balance across experts (are all experts used equally?)
    - Routing overhead (gate computation + gather/scatter)
    - Memory: all experts are in memory, but only top_k compute per token

    Args:
        d_model: Feature dimension of each token. Every expert is built with
            a hidden width of ``4 * d_model``.
        num_experts: Number of experts held by the layer.
        top_k: Number of experts each token is routed to.

    Raises:
        ValueError: If ``top_k`` is not in ``[1, num_experts]``.
    """

    def __init__(self, d_model: int, num_experts: int = 8, top_k: int = 2):
        super().__init__()
        # Fail at construction time rather than deep inside torch.topk on the
        # first forward pass.
        if not 1 <= top_k <= num_experts:
            raise ValueError(
                f"top_k must be between 1 and num_experts ({num_experts}), "
                f"got {top_k}"
            )
        self.num_experts = num_experts
        self.top_k = top_k
        self.gate = nn.Linear(d_model, num_experts, bias=False)
        self.experts = nn.ModuleList([
            Expert(d_model, d_model * 4) for _ in range(num_experts)
        ])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Route every token through its top_k experts and mix the outputs.

        Args:
            x: Tensor of shape ``(B, T, D)``.

        Returns:
            Tensor of shape ``(B, T, D)``: for each token, the sum of the
            selected experts' outputs weighted by the renormalized gate
            probabilities.
        """
        B, T, D = x.shape
        # reshape (not view) so non-contiguous inputs, e.g. a transposed
        # tensor, are accepted; for contiguous inputs this is still a view.
        x_flat = x.reshape(B * T, D)

        # Compute routing probabilities over the expert dimension
        router_logits = self.gate(x_flat)
        routing_weights = F.softmax(router_logits, dim=-1)

        # Select top-k experts per token and renormalize their weights so
        # the k selected weights sum to 1 for every token
        routing_weights, selected_experts = torch.topk(
            routing_weights, self.top_k, dim=-1
        )
        routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)

        # Dispatch tokens to experts and gather results
        final_output = torch.zeros_like(x_flat)
        for i, expert in enumerate(self.experts):
            # token_idx: rows routed to expert i; slot_idx: which of the
            # top_k slots picked it. topk returns distinct experts per token,
            # so token_idx has no duplicates and the indexed += is safe.
            token_idx, slot_idx = torch.where(selected_experts == i)
            if token_idx.numel() == 0:
                continue  # no token chose this expert in this batch
            expert_out = expert(x_flat[token_idx])
            final_output[token_idx] += (
                expert_out * routing_weights[token_idx, slot_idx, None]
            )

        return final_output.view(B, T, D)
|
|
|
|
|
|
class NanoMoEWhiteBox(nn.Module):
    """
    Sparse MoE language model (17.4M parameters with the default arguments).

    Replaces the standard FFN in each transformer layer with a
    SparseMoERouter, demonstrating sparse conditional computation.

    Args:
        vocab_size: Number of rows in the token embedding and output width
            of the language-model head.
        d_model: Feature dimension used throughout the network.
        n_heads: Number of attention heads per layer.
        n_layers: Number of [self-attention + sparse MoE] layers.
        max_seq_len: Number of learned positions; the longest sequence
            ``forward`` accepts.
        num_experts: Experts per MoE layer.
        top_k: Experts each token is routed to.
        causal: If True, each position attends only to itself and earlier
            positions. Defaults to False, which keeps the unmasked attention
            this model has always used.
            NOTE(review): if the training harness feeds next-token-shifted
            targets, unmasked attention can see the tokens it is asked to
            predict -- confirm against the caller and consider causal=True.
    """

    def __init__(self, vocab_size=50257, d_model=128, n_heads=4, n_layers=4,
                 max_seq_len=256, num_experts=8, top_k=2, causal=False):
        super().__init__()
        self.causal = causal

        # Module attribute names below determine the state_dict keys.
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(max_seq_len, d_model)

        self.layers = nn.ModuleList([
            nn.ModuleDict(dict(
                ln_1=nn.LayerNorm(d_model),
                attn=nn.MultiheadAttention(d_model, n_heads, batch_first=True),
                ln_2=nn.LayerNorm(d_model),
                moe=SparseMoERouter(d_model, num_experts=num_experts, top_k=top_k),
            ))
            for _ in range(n_layers)
        ])

        self.ln_f = nn.LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False)

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: Integer tensor of token ids with shape ``(B, T)``.
            targets: Optional integer tensor of class indices holding
                ``B * T`` elements; it is flattened before the loss.

        Returns:
            Tuple ``(logits, loss)``. ``logits`` has shape
            ``(B, T, vocab_size)``; ``loss`` is a scalar tensor, or None when
            ``targets`` is None.

        Raises:
            ValueError: If ``T`` exceeds the size of the positional table.
        """
        B, T = idx.size()
        max_len = self.pos_emb.num_embeddings
        if T > max_len:
            # Without this check the positional lookup fails with an opaque
            # out-of-range index error.
            raise ValueError(
                f"sequence length {T} exceeds the maximum supported length {max_len}"
            )
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)

        x = self.token_emb(idx) + self.pos_emb(pos)

        attn_mask = None
        if self.causal:
            # Boolean mask: True marks (query, key) pairs that must NOT
            # attend, i.e. keys strictly in the future of the query.
            attn_mask = torch.triu(
                torch.ones(T, T, dtype=torch.bool, device=idx.device), diagonal=1
            )

        for block in self.layers:
            # Self-attention with residual (pre-norm). Normalize once and
            # reuse the result as query, key and value.
            h = block['ln_1'](x)
            attn_out, _ = block['attn'](
                h, h, h,
                attn_mask=attn_mask,
                need_weights=False
            )
            x = x + attn_out
            # Sparse MoE FFN with residual
            x = x + block['moe'](block['ln_2'](x))

        logits = self.lm_head(self.ln_f(x))

        loss = None
        if targets is not None:
            # reshape (not view): targets sliced from a larger batch, such as
            # batch[:, 1:], are non-contiguous and cannot be viewed flat.
            loss = F.cross_entropy(
                logits.reshape(-1, logits.size(-1)), targets.reshape(-1)
            )

        return logits, loss
|