"""
MLPerf EDU: Micro-BERT — Text Classification Workload
======================================================
Provenance: Devlin et al. 2019, "BERT: Pre-training of Deep
Bidirectional Transformers for Language Understanding"
Maps to: MLPerf Training BERT
This implements a micro-scale bidirectional transformer encoder
for binary sentiment classification on the **SST-2** dataset
(Stanford Sentiment Treebank, Socher et al. 2013).
Dataset: SST-2 (Stanford Sentiment Treebank — Binary)
- 67,349 training sentences from movie reviews
- 872 validation sentences
- Binary labels: 0 = negative, 1 = positive
- Ships locally in data/sst2/SST-2/ (3.8 MB)
- Provenance: Socher et al., EMNLP 2013
Pedagogical concepts:
- Bidirectional self-attention (vs causal in GPT)
- [CLS] token pooling for classification
- Position embeddings for sequence ordering
- Fine-tuning paradigm (pre-train + task head)
- Real NLU challenge: sarcasm, negation, context
Architecture:
CharEmbed + PosEmbed → TransformerEncoder(2 layers, 4 heads, d=128)
→ [CLS] pooling → Linear(128, 2)
Total: ~0.5M parameters
Target: Val accuracy > 0.78 (with character-level tokenization)
"""
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter
from torch.utils.data import Dataset, DataLoader
REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
SST2_DIR = os.path.join(REPO_ROOT, "data", "sst2", "SST-2")
# Special token indices
PAD_IDX = 0
CLS_IDX = 1
SEP_IDX = 2
UNK_IDX = 3
def build_vocab(path, max_vocab=5000):
    """Build word-level vocabulary from SST-2 training data."""
    counter = Counter()
    with open(path, "r", encoding="utf-8") as f:
        f.readline()  # skip header
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) >= 2:
                words = parts[0].lower().split()
                counter.update(words)
    # Top max_vocab words + 4 special tokens
    vocab = {"[PAD]": PAD_IDX, "[CLS]": CLS_IDX, "[SEP]": SEP_IDX, "[UNK]": UNK_IDX}
    for word, _ in counter.most_common(max_vocab):
        vocab[word] = len(vocab)
    return vocab
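# Sizing note (assumes the standard SST-2 train.tsv): the training split has
# well over 5,000 distinct lowercased words, so build_vocab() returns a dict
# of exactly 5,004 entries: the 5,000 most frequent words plus the 4 special
# tokens defined above.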
def word_encode(text, vocab, max_len=64):
    """Encode text as word-level tokens with [CLS] and [SEP]."""
    tokens = [CLS_IDX]
    for word in text.lower().split()[:max_len - 2]:
        tokens.append(vocab.get(word, UNK_IDX))
    tokens.append(SEP_IDX)
    if len(tokens) < max_len:
        tokens += [PAD_IDX] * (max_len - len(tokens))
    return tokens
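# Worked example (hypothetical token ids, for illustration only): with the
# default max_len=64, a fragment like "a gorgeous film" encodes as
#   [CLS_IDX, id("a"), id("gorgeous"), id("film"), SEP_IDX, PAD_IDX, ..., PAD_IDX]
# i.e. always exactly 64 ids, so every example stacks cleanly into a
# (batch, 64) LongTensor; words outside the vocabulary map to UNK_IDX.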
# ============================================================================
# SST-2 Dataset — real binary sentiment from Stanford NLP
# ============================================================================
class SST2Dataset(Dataset):
    """
    SST-2 binary sentiment classification dataset.

    Loads real movie review sentences from the Stanford Sentiment
    Treebank. Each sentence is a short movie review fragment with
    a binary label (0=negative, 1=positive).

    Uses word-level whitespace tokenization with a 5K vocabulary built
    from the training data — no external tokenizer dependency needed.

    Args:
        split: "train" or "dev"
        vocab: Word-to-index dictionary (built from training data)
        max_len: Maximum sequence length
        max_samples: Limit samples for faster training (None = all)
    """
    def __init__(self, split="train", vocab=None, max_len=64, max_samples=None):
        super().__init__()
        self.max_len = max_len
        if split == "train":
            path = os.path.join(SST2_DIR, "train.tsv")
        else:
            path = os.path.join(SST2_DIR, "dev.tsv")
        if not os.path.exists(path):
            raise FileNotFoundError(
                f"SST-2 dataset not found at {path}. "
                "Download: cd data/sst2 && curl -sL "
                "'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip' "
                "-o SST-2.zip && unzip SST-2.zip"
            )
        # Build vocab from training data if not provided
        if vocab is None:
            train_path = os.path.join(SST2_DIR, "train.tsv")
            vocab = build_vocab(train_path)
        self.vocab = vocab

        self.texts = []
        self.labels = []
        with open(path, "r", encoding="utf-8") as f:
            header = f.readline()  # Skip header
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) >= 2:
                    sentence = parts[0]
                    label = int(parts[1])
                    self.texts.append(word_encode(sentence, vocab, max_len))
                    self.labels.append(label)
        if max_samples is not None:
            self.texts = self.texts[:max_samples]
            self.labels = self.labels[:max_samples]
        self.texts = torch.tensor(self.texts, dtype=torch.long)
        self.labels = torch.tensor(self.labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]
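# Minimal usage sketch (assumes data/sst2/SST-2/ has been downloaded):
#   vocab = build_vocab(os.path.join(SST2_DIR, "train.tsv"))
#   ds = SST2Dataset(split="dev", vocab=vocab)
#   x, y = ds[0]   # x: LongTensor of shape (64,), y: scalar label 0 or 1
#   len(ds)        # 872 sentences in the dev split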
# ============================================================================
# Micro-BERT Model
# ============================================================================
class MicroBERT(nn.Module):
    """
    Micro-scale BERT encoder for binary sentiment classification.

    Key differences from GPT:
    - Bidirectional attention (no causal mask)
    - [CLS] token pooling for classification
    - Smaller scale: 2 layers, 4 heads, d=128

    Args:
        vocab_size: Vocabulary size (~5004 for word-level)
        d_model: Hidden dimension
        nhead: Number of attention heads
        num_layers: Number of transformer encoder layers
        max_len: Maximum sequence length
        num_classes: Number of classification labels (2 for SST-2)
    """
    def __init__(self, vocab_size=5004, d_model=128, nhead=4,
                 num_layers=2, max_len=64, num_classes=2, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        # Embeddings
        self.token_embed = nn.Embedding(vocab_size, d_model, padding_idx=PAD_IDX)
        self.pos_embed = nn.Embedding(max_len, d_model)
        self.embed_dropout = nn.Dropout(dropout)
        self.embed_norm = nn.LayerNorm(d_model)
        # Transformer encoder (bidirectional — no causal mask!)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead,
            dim_feedforward=d_model * 4,
            dropout=dropout, activation='gelu',
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        # Classification head: [CLS] → Linear → Tanh → Linear (raw logits;
        # softmax is applied implicitly by F.cross_entropy)
        self.classifier = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.Tanh(),
            nn.Dropout(dropout),
            nn.Linear(d_model, num_classes),
        )

    def forward(self, input_ids, targets=None):
        """
        Args:
            input_ids: Token indices (B, seq_len)
            targets: Optional class labels (B,) for computing loss

        Returns:
            logits: (B, num_classes)
            loss: Cross-entropy loss (if targets provided)
        """
        B, T = input_ids.shape
        # Embed tokens + positions
        positions = torch.arange(T, device=input_ids.device).unsqueeze(0)
        x = self.token_embed(input_ids) + self.pos_embed(positions)
        x = self.embed_norm(self.embed_dropout(x))
        # Create padding mask
        pad_mask = (input_ids == PAD_IDX)  # True where padded
        # Bidirectional transformer encoding
        x = self.encoder(x, src_key_padding_mask=pad_mask)
        # Pool: take the [CLS] token (position 0)
        cls_output = x[:, 0, :]  # (B, d_model)
        # Classify
        logits = self.classifier(cls_output)  # (B, num_classes)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits, targets)
        return logits, loss
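# Shape and size walkthrough (rough arithmetic, assuming the default config):
#   input_ids (B, 64) → embeddings (B, 64, 128) → encoder (B, 64, 128)
#   → [CLS] pooling (B, 128) → logits (B, 2)
# The word embedding table dominates the parameter count (5004 * 128 ≈ 0.64M);
# the two encoder layers add roughly 0.40M and the Tanh pooler plus output
# head roughly 0.02M, for a total of about 1.06M parameters, which the
# __main__ block below prints exactly.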
def get_bert_dataloaders(batch_size=64, seed=42, max_train=10000):
    """
    Create train/val DataLoaders for SST-2 sentiment classification.

    Uses a 10K subset of training data by default for fast iteration
    (full 67K available by setting max_train=None).
    """
    # Build vocabulary from full training data
    vocab = build_vocab(os.path.join(SST2_DIR, "train.tsv"))
    train_ds = SST2Dataset(split="train", vocab=vocab, max_samples=max_train)
    val_ds = SST2Dataset(split="dev", vocab=vocab)
    train_loader = DataLoader(
        train_ds, batch_size=batch_size, shuffle=True,
        generator=torch.Generator().manual_seed(seed),
    )
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader, len(vocab)
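# Usage sketch (mirrors train_and_evaluate below): build the loaders first,
# then size the model from the returned vocabulary length so the embedding
# table matches the token ids, e.g.
#   train_loader, val_loader, vocab_size = get_bert_dataloaders()
#   model = MicroBERT(vocab_size=vocab_size)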
# Training loop for standalone testing
def train_and_evaluate(epochs=20, batch_size=64, lr=1e-3, seed=42):
    """Train MicroBERT on SST-2 and report convergence."""
    torch.manual_seed(seed)
    train_loader, val_loader, vocab_size = get_bert_dataloaders(
        batch_size=batch_size, seed=seed
    )
    model = MicroBERT(vocab_size=vocab_size)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    print(f" SST-2: {len(train_loader.dataset)} train, "
          f"{len(val_loader.dataset)} val sentences")

    for epoch in range(epochs):
        # Training pass
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        for x, y in train_loader:
            optimizer.zero_grad()
            logits, loss = model(x, targets=y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            correct += (logits.argmax(1) == y).sum().item()
            total += y.size(0)
        train_loss /= len(train_loader)
        train_acc = correct / total

        # Validation pass
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for x, y in val_loader:
                logits, loss = model(x, targets=y)
                val_loss += loss.item()
                correct += (logits.argmax(1) == y).sum().item()
                total += y.size(0)
        val_loss /= len(val_loader)
        val_acc = correct / total

        if (epoch + 1) % 4 == 0:
            print(f" Epoch {epoch+1:3d}: train_loss={train_loss:.4f} "
                  f"train_acc={train_acc:.3f} "
                  f"val_loss={val_loss:.4f} val_acc={val_acc:.3f}")

    n_params = sum(p.numel() for p in model.parameters())
    return {
        "final_train_loss": train_loss,
        "final_val_loss": val_loss,
        "final_train_acc": train_acc,
        "final_val_acc": val_acc,
        "n_params": n_params,
    }
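# Illustrative check against the module's stated target (a sketch, not part
# of the benchmark harness):
#   results = train_and_evaluate(epochs=20)
#   assert results["final_val_acc"] > 0.78  # target from the module docstring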
if __name__ == "__main__":
vocab = build_vocab(os.path.join(SST2_DIR, "train.tsv"))
model = MicroBERT(vocab_size=len(vocab))
n_params = sum(p.numel() for p in model.parameters())
print(f"MicroBERT: {n_params:,} parameters")
print(f"Vocab: {len(vocab)} (word-level from SST-2)")
print()
print("Training on SST-2 sentiment classification...")
results = train_and_evaluate(epochs=20)
print(f"\n✅ Results:")
print(f" Final val accuracy: {results['final_val_acc']:.3f}")
print(f" Final val loss: {results['final_val_loss']:.4f}")
print(f" Parameters: {results['n_params']:,}")