"""
|
|
MLPerf EDU: Micro-BERT — Text Classification Workload
|
|
======================================================
|
|
Provenance: Devlin et al. 2019, "BERT: Pre-training of Deep
|
|
Bidirectional Transformers for Language Understanding"
|
|
Maps to: MLPerf Training BERT
|
|
|
|
This implements a micro-scale bidirectional transformer encoder
|
|
for binary sentiment classification on the **SST-2** dataset
|
|
(Stanford Sentiment Treebank, Socher et al. 2013).
|
|
|
|
Dataset: SST-2 (Stanford Sentiment Treebank — Binary)
|
|
- 67,349 training sentences from movie reviews
|
|
- 872 validation sentences
|
|
- Binary labels: 0 = negative, 1 = positive
|
|
- Ships locally in data/sst2/SST-2/ (3.8 MB)
|
|
- Provenance: Socher et al., EMNLP 2013
|
|
|
|
Pedagogical concepts:
|
|
- Bidirectional self-attention (vs causal in GPT)
|
|
- [CLS] token pooling for classification
|
|
- Position embeddings for sequence ordering
|
|
- Fine-tuning paradigm (pre-train + task head)
|
|
- Real NLU challenge: sarcasm, negation, context
|
|
|
|
Architecture:
|
|
CharEmbed + PosEmbed → TransformerEncoder(2 layers, 4 heads, d=128)
|
|
→ [CLS] pooling → Linear(128, 2)
|
|
|
|
Total: ~0.5M parameters
|
|
Target: Val accuracy > 0.78 (with character-level tokenization)
|
|
"""

import os
from collections import Counter

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
SST2_DIR = os.path.join(REPO_ROOT, "data", "sst2", "SST-2")

# Special token indices
PAD_IDX = 0
CLS_IDX = 1
SEP_IDX = 2
UNK_IDX = 3


def build_vocab(path, max_vocab=5000):
    """Build word-level vocabulary from SST-2 training data."""
    counter = Counter()
    with open(path, "r", encoding="utf-8") as f:
        f.readline()  # skip header
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) >= 2:
                words = parts[0].lower().split()
                counter.update(words)
    # Top max_vocab words + 4 special tokens
    vocab = {"[PAD]": PAD_IDX, "[CLS]": CLS_IDX, "[SEP]": SEP_IDX, "[UNK]": UNK_IDX}
    for word, _ in counter.most_common(max_vocab):
        vocab[word] = len(vocab)
    return vocab
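
# Illustrative usage sketch: the 4 special tokens always occupy indices 0-3,
# so the returned dict has at most max_vocab + 4 entries.
#   vocab = build_vocab(os.path.join(SST2_DIR, "train.tsv"))
#   len(vocab)  # ≈ 5,004 with the default max_vocab=5000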


def word_encode(text, vocab, max_len=64):
    """Encode text as word-level tokens with [CLS] and [SEP]."""
    tokens = [CLS_IDX]
    for word in text.lower().split()[:max_len - 2]:
        tokens.append(vocab.get(word, UNK_IDX))
    tokens.append(SEP_IDX)
    if len(tokens) < max_len:
        tokens += [PAD_IDX] * (max_len - len(tokens))
    return tokens
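
# Illustrative sketch with a hypothetical toy vocab (indices chosen by hand):
#   vocab = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[UNK]": 3, "great": 4}
#   word_encode("great movie", vocab, max_len=6)
#   -> [1, 4, 3, 2, 0, 0]   # [CLS] great [UNK] [SEP] [PAD] [PAD]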


# ============================================================================
# SST-2 Dataset — real binary sentiment from Stanford NLP
# ============================================================================

class SST2Dataset(Dataset):
    """
    SST-2 binary sentiment classification dataset.

    Loads real movie review sentences from the Stanford Sentiment
    Treebank. Each sentence is a short movie review fragment with
    a binary label (0=negative, 1=positive).

    Uses word-level whitespace tokenization with a 5K vocabulary built
    from the training data — no external tokenizer dependency needed.

    Args:
        split: "train" or "dev"
        vocab: Word-to-index dictionary (built from training data)
        max_len: Maximum sequence length
        max_samples: Limit samples for faster training (None = all)
    """

    def __init__(self, split="train", vocab=None, max_len=64, max_samples=None):
        super().__init__()
        self.max_len = max_len

        if split == "train":
            path = os.path.join(SST2_DIR, "train.tsv")
        else:
            path = os.path.join(SST2_DIR, "dev.tsv")

        if not os.path.exists(path):
            raise FileNotFoundError(
                f"SST-2 dataset not found at {path}. "
                "Download: cd data/sst2 && curl -sL "
                "'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip' "
                "-o SST-2.zip && unzip SST-2.zip"
            )

        # Build vocab from training data if not provided
        if vocab is None:
            train_path = os.path.join(SST2_DIR, "train.tsv")
            vocab = build_vocab(train_path)
        self.vocab = vocab

        self.texts = []
        self.labels = []

        with open(path, "r", encoding="utf-8") as f:
            f.readline()  # skip header
            for line in f:
                parts = line.strip().split("\t")
                if len(parts) >= 2:
                    sentence = parts[0]
                    label = int(parts[1])
                    self.texts.append(word_encode(sentence, vocab, max_len))
                    self.labels.append(label)

        if max_samples is not None:
            self.texts = self.texts[:max_samples]
            self.labels = self.labels[:max_samples]

        self.texts = torch.tensor(self.texts, dtype=torch.long)
        self.labels = torch.tensor(self.labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]
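
# Illustrative usage sketch (requires the local SST-2 files; the vocab is
# shared between splits so token indices line up):
#   vocab = build_vocab(os.path.join(SST2_DIR, "train.tsv"))
#   dev = SST2Dataset(split="dev", vocab=vocab)
#   tokens, label = dev[0]   # tokens: (64,) LongTensor, label: scalar 0 or 1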


# ============================================================================
# Micro-BERT Model
# ============================================================================

class MicroBERT(nn.Module):
    """
    Micro-scale BERT encoder for binary sentiment classification.

    Key differences from GPT:
    - Bidirectional attention (no causal mask)
    - [CLS] token pooling for classification
    - Smaller scale: 2 layers, 4 heads, d=128

    Args:
        vocab_size: Vocabulary size (~5004 for word-level)
        d_model: Hidden dimension
        nhead: Number of attention heads
        num_layers: Number of transformer encoder layers
        max_len: Maximum sequence length
        num_classes: Number of classification labels (2 for SST-2)
    """

    def __init__(self, vocab_size=5004, d_model=128, nhead=4,
                 num_layers=2, max_len=64, num_classes=2, dropout=0.1):
        super().__init__()
        self.d_model = d_model

        # Embeddings
        self.token_embed = nn.Embedding(vocab_size, d_model, padding_idx=PAD_IDX)
        self.pos_embed = nn.Embedding(max_len, d_model)
        self.embed_dropout = nn.Dropout(dropout)
        self.embed_norm = nn.LayerNorm(d_model)

        # Transformer encoder (bidirectional — no causal mask!)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead,
            dim_feedforward=d_model * 4,
            dropout=dropout, activation='gelu',
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Classification head: [CLS] → Linear → Tanh → Linear (returns logits;
        # softmax is applied implicitly inside the cross-entropy loss)
        self.classifier = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.Tanh(),
            nn.Dropout(dropout),
            nn.Linear(d_model, num_classes),
        )

    def forward(self, input_ids, targets=None):
        """
        Args:
            input_ids: Token indices (B, seq_len)
            targets: Optional class labels (B,) for computing loss
        Returns:
            logits: (B, num_classes)
            loss: Cross-entropy loss (if targets provided)
        """
        B, T = input_ids.shape

        # Embed tokens + positions
        positions = torch.arange(T, device=input_ids.device).unsqueeze(0)
        x = self.token_embed(input_ids) + self.pos_embed(positions)
        x = self.embed_norm(self.embed_dropout(x))

        # Create padding mask
        pad_mask = (input_ids == PAD_IDX)  # True where padded

        # Bidirectional transformer encoding
        x = self.encoder(x, src_key_padding_mask=pad_mask)

        # Pool: take the [CLS] token (position 0)
        cls_output = x[:, 0, :]  # (B, d_model)

        # Classify
        logits = self.classifier(cls_output)  # (B, num_classes)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits, targets)

        return logits, loss
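
# Minimal sanity-check sketch (the helper below is illustrative and not part of
# the MLPerf EDU harness): run a dummy batch through MicroBERT to confirm the
# shapes described in the module docstring.
def _demo_forward_shapes(batch_size=2, seq_len=16):
    """Illustrative only: forward a random batch and check output shapes."""
    model = MicroBERT(vocab_size=100, max_len=seq_len)
    input_ids = torch.randint(4, 100, (batch_size, seq_len))  # avoid special idxs
    targets = torch.randint(0, 2, (batch_size,))
    logits, loss = model(input_ids, targets=targets)
    assert logits.shape == (batch_size, 2)  # (B, num_classes)
    assert loss.dim() == 0                  # scalar cross-entropy
    return logits, loss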


def get_bert_dataloaders(batch_size=64, seed=42, max_train=10000):
    """
    Create train/val DataLoaders for SST-2 sentiment classification.

    Uses a 10K subset of training data by default for fast iteration
    (full 67K available by setting max_train=None).
    """
    # Build vocabulary from full training data
    vocab = build_vocab(os.path.join(SST2_DIR, "train.tsv"))
    train_ds = SST2Dataset(split="train", vocab=vocab, max_samples=max_train)
    val_ds = SST2Dataset(split="dev", vocab=vocab)

    train_loader = DataLoader(
        train_ds, batch_size=batch_size, shuffle=True,
        generator=torch.Generator().manual_seed(seed),
    )
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader, len(vocab)
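
# Illustrative usage sketch (needs the local SST-2 files described above):
#   train_loader, val_loader, vocab_size = get_bert_dataloaders(batch_size=32)
#   x, y = next(iter(train_loader))   # x: (32, 64) token ids, y: (32,) labels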


# Training loop for standalone testing
def train_and_evaluate(epochs=20, batch_size=64, lr=1e-3, seed=42):
    """Train MicroBERT on SST-2 and report convergence."""
    torch.manual_seed(seed)

    train_loader, val_loader, vocab_size = get_bert_dataloaders(
        batch_size=batch_size, seed=seed
    )
    model = MicroBERT(vocab_size=vocab_size)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)

    print(f"  SST-2: {len(train_loader.dataset)} train, "
          f"{len(val_loader.dataset)} val sentences")

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        correct = 0
        total = 0
        for x, y in train_loader:
            optimizer.zero_grad()
            logits, loss = model(x, targets=y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            correct += (logits.argmax(1) == y).sum().item()
            total += y.size(0)
        train_loss /= len(train_loader)
        train_acc = correct / total

        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for x, y in val_loader:
                logits, loss = model(x, targets=y)
                val_loss += loss.item()
                correct += (logits.argmax(1) == y).sum().item()
                total += y.size(0)
        val_loss /= len(val_loader)
        val_acc = correct / total

        if (epoch + 1) % 4 == 0:
            print(f"  Epoch {epoch+1:3d}: train_loss={train_loss:.4f} "
                  f"train_acc={train_acc:.3f} "
                  f"val_loss={val_loss:.4f} val_acc={val_acc:.3f}")

    n_params = sum(p.numel() for p in model.parameters())
    return {
        "final_train_loss": train_loss,
        "final_val_loss": val_loss,
        "final_train_acc": train_acc,
        "final_val_acc": val_acc,
        "n_params": n_params,
    }


if __name__ == "__main__":
    vocab = build_vocab(os.path.join(SST2_DIR, "train.tsv"))
    model = MicroBERT(vocab_size=len(vocab))
    n_params = sum(p.numel() for p in model.parameters())
    print(f"MicroBERT: {n_params:,} parameters")
    print(f"Vocab: {len(vocab)} (word-level from SST-2)")
    print()
    print("Training on SST-2 sentiment classification...")
    results = train_and_evaluate(epochs=20)
    print("\n✅ Results:")
    print(f"  Final val accuracy: {results['final_val_acc']:.3f}")
    print(f"  Final val loss: {results['final_val_loss']:.4f}")
    print(f"  Parameters: {results['n_params']:,}")