Files
cs249r_book/mlperf-edu/reference/cloud/micro_gnn.py
Vijay Janapa Reddi a9878ad6bd feat: import mlperf-edu pedagogical benchmark suite
Snapshot of the standalone /Users/VJ/GitHub/mlperf-edu/ repo as of
2026-04-16, brought into MLSysBook as a parked feature branch for
backup and iteration. Not for merge to dev.

Contents (88 files, ~2.3 MB):
- 16 reference workloads (cloud / edge / tiny / agent divisions)
- LoadGen proxy harness + SUT plugin protocol
- Compliance checker, autograder, hardware fingerprint
- Paper draft (paper.tex) with TikZ/SVG figure sources
- Three lab examples + practitioner workflow configs
- Workload + dataset YAML registries (single source of truth)

Excluded (per mlperf-edu/.gitignore + size constraints):
- Datasets (6.6 GB), checkpoints (260 MB), gpt2 weights (523 MB)
- Generated PDFs, .venv, build artifacts
2026-04-16 14:15:05 -04:00

348 lines
13 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
MLPerf EDU: Micro-GCN — Graph Neural Network Workload
======================================================
Provenance: Kipf & Welling 2017, "Semi-Supervised Classification with
Graph Convolutional Networks"
Maps to: MLPerf Training GNN (Graph Neural Networks division)
This implements a minimal Graph Convolutional Network (GCN) for node
classification on the **Cora citation network** — a real, widely-cited
benchmark dataset (2,708 nodes, 5,429 edges, 7 classes).
Dataset: Cora (McCallum et al., 2000; Kipf & Welling 2017)
- 2,708 scientific papers (nodes)
- 5,429 citation links (edges)
- 1,433 binary bag-of-words features per paper
- 7 subject classes: Case_Based, Genetic_Algorithms, Neural_Networks,
Probabilistic_Methods, Reinforcement_Learning, Rule_Learning, Theory
- Split sizes follow the standard semi-supervised setup: 140 train / 500 val / 1000 test
(node membership is drawn by a fixed-seed random shuffle inside load_cora())
- Ships locally in data/cora/ (168KB compressed)
Pedagogical concepts:
- Message passing: each node aggregates features from neighbors
- Graph convolution: first-order approximation of spectral graph convolution (truncated Chebyshev polynomial expansion)
- Over-smoothing: why deeper GNNs lose discriminative power
- Adjacency normalization: D^{-1/2} A D^{-1/2}
- Semi-supervised learning: only 20 labels per class (140 total)
Architecture:
GCNConv(1433, 64) → ReLU → Dropout
→ GCNConv(64, 32) → ReLU → Dropout
→ Linear(32, 7)
Total: ~94K parameters
Target: Test accuracy > 0.78 (production GCN achieves ~81%)
"""
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from typing import Tuple
# Repository root: two directory levels above the folder that holds this file.
_THIS_DIR = os.path.dirname(__file__)
REPO_ROOT = os.path.abspath(os.path.join(_THIS_DIR, os.pardir, os.pardir))
# Default directory searched by load_cora() for cora.content / cora.cites.
CORA_DIR = os.path.join(REPO_ROOT, "data", "cora", "cora")
# ============================================================================
# Graph Convolution Layer — Pure PyTorch, no PyG / DGL dependency
# ============================================================================
class GCNConv(nn.Module):
    """
    Single graph-convolution layer in the style of Kipf & Welling (2017).

    The forward pass evaluates ``adj_norm @ (x @ W) + b``. When ``adj_norm``
    is the symmetrically normalized adjacency with self-loops,
    ``D_hat^{-1/2} A_hat D_hat^{-1/2}`` with ``A_hat = A + I`` and ``D_hat``
    the degree matrix of ``A_hat``, this is a 1-hop neighborhood aggregation:
    every node's output is a learned linear map of the features of its
    neighbors and of itself. No non-linearity is applied inside the layer;
    the caller is expected to add it.

    Args:
        in_features: Input feature dimension.
        out_features: Output feature dimension.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        # Glorot/Xavier-uniform weight matrix; the bias starts at zero.
        initial_weight = torch.empty(in_features, out_features)
        nn.init.xavier_uniform_(initial_weight)
        self.weight = nn.Parameter(initial_weight)
        self.bias = nn.Parameter(torch.zeros(out_features))

    def forward(self, x: torch.Tensor, adj_norm: torch.Tensor) -> torch.Tensor:
        """
        Apply the layer to a full graph.

        Args:
            x: Node features, shape (N, in_features).
            adj_norm: Normalized adjacency matrix, shape (N, N).

        Returns:
            Updated node features, shape (N, out_features).
        """
        # Project first, then aggregate: the matrix multiplied by the
        # adjacency has out_features columns instead of in_features.
        projected = x.mm(self.weight)                  # (N, out_features)
        aggregated = torch.spmm(adj_norm, projected)   # (N, out_features)
        return aggregated + self.bias
class MicroGCN(nn.Module):
    """
    Small GCN for node classification on the Cora dataset.

    Architecture:
        Input features -> GCNConv(nhid)      -> ReLU -> Dropout
                       -> GCNConv(nhid // 2) -> ReLU -> Dropout
                       -> Linear(nclass)     -> log-softmax

    Args:
        nfeat: Number of input features per node (1433 for Cora).
        nhid: Width of the first graph convolution (default 64); the second
            graph convolution uses half of it.
        nclass: Number of output classes (7 for Cora).
        dropout: Dropout probability applied after each graph convolution.

    Exercise: Over-smoothing Analysis
        Try adding more GCN layers (3, 4, 5) and measure test accuracy.
        You should observe accuracy degradation because GCNs suffer from
        "over-smoothing": after too many message-passing rounds, all node
        representations converge to the same vector (Laplacian smoothing).
        This is a fundamental limitation of spectral GNNs and motivates
        architectures like GAT, GraphSAGE, or techniques like residual
        connections and jumping knowledge.
    """

    def __init__(self, nfeat=1433, nhid=64, nclass=7, dropout=0.5):
        super().__init__()
        narrow_width = nhid // 2
        self.gc1 = GCNConv(nfeat, nhid)
        self.gc2 = GCNConv(nhid, narrow_width)
        self.classifier = nn.Linear(narrow_width, nclass)
        # Stored as a plain probability; dropout is applied functionally.
        self.dropout = dropout

    def forward(self, x, adj_norm):
        """
        Run the whole graph through the network.

        Args:
            x: Node features, shape (N, nfeat).
            adj_norm: Normalized adjacency, shape (N, N), sparse or dense.

        Returns:
            Per-node log-probabilities over classes, shape (N, nclass).
        """
        hidden = x
        # Both graph convolutions share the same ReLU -> dropout tail.
        for conv in (self.gc1, self.gc2):
            hidden = F.relu(conv(hidden, adj_norm))
            hidden = F.dropout(hidden, self.dropout, training=self.training)
        logits = self.classifier(hidden)
        return F.log_softmax(logits, dim=1)
# ============================================================================
# Cora Dataset Loader — real citation network, shipped locally
# ============================================================================
# Subject classes of the Cora papers. The position of a name in this list is
# the integer label that load_cora() assigns to it.
CLASS_NAMES = [
    "Case_Based",
    "Genetic_Algorithms",
    "Neural_Networks",
    "Probabilistic_Methods",
    "Reinforcement_Learning",
    "Rule_Learning",
    "Theory",
]
def _read_cora_nodes(content_path, label_to_idx):
    """Parse cora.content into parallel lists: paper ids, feature rows, label indices."""
    paper_ids, feature_rows, label_ids = [], [], []
    with open(content_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Tolerate blank / trailing lines instead of failing on int('').
                continue
            # Each line: paper_id feat_1 feat_2 ... feat_k class_label
            parts = line.split("\t")
            paper_ids.append(int(parts[0]))
            feature_rows.append([float(v) for v in parts[1:-1]])
            label_ids.append(label_to_idx[parts[-1]])
    return paper_ids, feature_rows, label_ids


def _read_cora_edges(cites_path, id_to_idx):
    """Parse cora.cites into symmetric (src, dst) index lists, skipping unknown ids."""
    edges_src, edges_dst = [], []
    with open(cites_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            parts = raw_line.strip().split("\t")
            if len(parts) != 2:
                continue
            u = id_to_idx.get(int(parts[0]))  # cited paper
            v = id_to_idx.get(int(parts[1]))  # citing paper
            if u is None or v is None:
                continue
            # Store both directions so the graph is undirected.
            edges_src.extend((u, v))
            edges_dst.extend((v, u))
    return edges_src, edges_dst


def _normalized_adjacency(edges_src, edges_dst, n):
    """Build sparse D^{-1/2} (A + I) D^{-1/2} with a binary A, without densifying."""
    loops = torch.arange(n, dtype=torch.long)
    rows = torch.cat([torch.tensor(edges_src, dtype=torch.long), loops])
    cols = torch.cat([torch.tensor(edges_dst, dtype=torch.long), loops])
    # coalesce() merges duplicate (row, col) pairs; only the resulting index
    # pattern is kept, which makes the adjacency binary.
    pattern = torch.sparse_coo_tensor(
        torch.stack([rows, cols]),
        torch.ones(rows.numel(), dtype=torch.float32),
        size=(n, n),
    ).coalesce()
    index = pattern.indices()
    row, col = index[0], index[1]
    # Row degree of the binary matrix; >= 1 everywhere because of the
    # self-loops, so the inverse square root is always finite.
    degree = torch.bincount(row, minlength=n).to(torch.float32)
    d_inv_sqrt = torch.pow(degree, -0.5)
    # Entry (i, j) of D^{-1/2} A D^{-1/2} is d_i^{-1/2} * d_j^{-1/2}.
    values = d_inv_sqrt[row] * d_inv_sqrt[col]
    return torch.sparse_coo_tensor(index, values, size=(n, n)).coalesce()


def _make_split_masks(labels, n_classes, seed, train_per_class, n_val, n_test):
    """Draw disjoint boolean train/val/test masks with a seeded NumPy RNG."""
    rng = np.random.RandomState(seed)
    n_nodes = labels.shape[0]
    train_mask = torch.zeros(n_nodes, dtype=torch.bool)
    val_mask = torch.zeros(n_nodes, dtype=torch.bool)
    test_mask = torch.zeros(n_nodes, dtype=torch.bool)
    # Training set: up to `train_per_class` shuffled nodes of every class.
    for c in range(n_classes):
        class_indices = (labels == c).nonzero(as_tuple=True)[0].numpy()
        rng.shuffle(class_indices)
        train_mask[torch.from_numpy(class_indices[:train_per_class])] = True
    # Validation and test sets: consecutive slices of the shuffled remainder.
    remaining = (~train_mask).nonzero(as_tuple=True)[0].numpy()
    rng.shuffle(remaining)
    val_mask[torch.from_numpy(remaining[:n_val])] = True
    test_mask[torch.from_numpy(remaining[n_val:n_val + n_test])] = True
    return train_mask, val_mask, test_mask


def load_cora(data_dir=None, *, seed=42, train_per_class=20, n_val=500, n_test=1000):
    """
    Load the Cora citation network from local files.

    Data format (tab-separated):
        cora.content: <paper_id> <binary features...> <class_label>
        cora.cites:   <cited_paper_id> <citing_paper_id>

    Citations are treated as undirected edges. Self-loops are added,
    duplicate edges are collapsed, and the adjacency is symmetrically
    normalized as D^{-1/2} (A + I) D^{-1/2}. The normalization is done on the
    sparse edge list, so no dense N x N matrix is materialized.

    The split uses the usual semi-supervised sizes (20 labels per class,
    500 validation nodes, 1000 test nodes by default), but node membership
    is drawn here by a seeded shuffle rather than read from fixed index files.

    Args:
        data_dir: Directory containing cora.content and cora.cites.
            Defaults to CORA_DIR.
        seed: Seed for the NumPy RNG that draws the split.
        train_per_class: Maximum number of training nodes per class.
        n_val: Maximum number of validation nodes.
        n_test: Maximum number of test nodes.

    Returns:
        dict with keys: x, y, adj_norm, train_mask, val_mask, test_mask,
        n_classes, n_features, class_names, n_nodes, n_edges.
        ``adj_norm`` is a coalesced sparse COO tensor. ``n_edges`` counts
        citation lines whose two endpoints are both known papers
        (duplicates included).

    Raises:
        FileNotFoundError: If cora.content or cora.cites is missing.
        ValueError: If a split size is negative or cora.content has no nodes.
        KeyError: If a class label is not listed in CLASS_NAMES.
    """
    if min(train_per_class, n_val, n_test) < 0:
        raise ValueError("train_per_class, n_val and n_test must be non-negative")
    if data_dir is None:
        data_dir = CORA_DIR
    content_path = os.path.join(data_dir, "cora.content")
    cites_path = os.path.join(data_dir, "cora.cites")
    # Both files are required; report a missing one with the download hint.
    if not (os.path.exists(content_path) and os.path.exists(cites_path)):
        raise FileNotFoundError(
            f"Cora dataset not found at {data_dir}. "
            "Download: cd data/cora && curl -sL "
            "'https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz' | tar xz"
        )

    # ── Parse nodes ──────────────────────────────────────────────────────
    label_to_idx = {name: i for i, name in enumerate(CLASS_NAMES)}
    paper_ids, feature_rows, label_ids = _read_cora_nodes(content_path, label_to_idx)
    if not paper_ids:
        raise ValueError(f"No nodes found in {content_path}")
    n_nodes = len(paper_ids)
    id_to_idx = {pid: i for i, pid in enumerate(paper_ids)}
    features = torch.tensor(feature_rows, dtype=torch.float32)  # (N, n_features)
    labels = torch.tensor(label_ids, dtype=torch.long)          # (N,)

    # ── Parse edges and build the normalized adjacency ───────────────────
    edges_src, edges_dst = _read_cora_edges(cites_path, id_to_idx)
    adj_norm_sparse = _normalized_adjacency(edges_src, edges_dst, n_nodes)

    # ── Seeded train / val / test split ──────────────────────────────────
    train_mask, val_mask, test_mask = _make_split_masks(
        labels, len(CLASS_NAMES), seed, train_per_class, n_val, n_test
    )

    return {
        "x": features,
        "y": labels,
        "adj_norm": adj_norm_sparse,
        "train_mask": train_mask,
        "val_mask": val_mask,
        "test_mask": test_mask,
        "n_classes": len(CLASS_NAMES),
        "n_features": features.shape[1],
        "class_names": CLASS_NAMES,
        "n_nodes": n_nodes,
        "n_edges": len(edges_src) // 2,
    }
def get_gnn_dataloaders(seed=42, **kwargs):
    """
    Return the Cora graph data dict produced by ``load_cora()``.

    GNN training here works on the whole graph at once, so a single dict is
    returned instead of standard DataLoaders. The signature mirrors the
    dataset_factory interface.

    NOTE: ``seed`` and any extra keyword arguments are accepted for interface
    compatibility only; they are not forwarded to ``load_cora()``.
    """
    graph_data = load_cora()
    return graph_data
# ============================================================================
# Training loop for standalone testing
# ============================================================================
def _masked_accuracy(log_probs, targets, mask):
    """Fraction of nodes selected by `mask` whose argmax prediction equals the target."""
    predicted = log_probs[mask].argmax(dim=1)
    return (predicted == targets[mask]).float().mean().item()


def train_and_evaluate(epochs=200, lr=0.01, seed=42):
    """
    Train MicroGCN full-batch on Cora and report accuracy (targets ~78-81% test).

    Args:
        epochs: Number of full-graph training steps. With 0 the untrained
            model is evaluated and ``final_loss`` is NaN.
        lr: Adam learning rate (weight decay is fixed at 5e-4).
        seed: Seed passed to ``torch.manual_seed`` before the model is built,
            so weight initialization and dropout are reproducible. This
            reseeds PyTorch's global RNG.

    Returns:
        dict with keys:
            final_loss: training loss of the last epoch,
            best_val_acc: highest validation accuracy seen over all epochs,
            test_acc: test accuracy of the final-epoch weights (no
                checkpoint selection on validation accuracy is performed),
            n_params: total number of model parameters.
    """
    # Without this the `seed` argument would have no effect on the run.
    torch.manual_seed(seed)

    data = load_cora()
    print(f" Cora dataset: {data['n_nodes']} nodes, {data['n_edges']} edges")
    print(f" Features: {data['n_features']}, Classes: {data['n_classes']}")
    print(f" Train: {data['train_mask'].sum()}, Val: {data['val_mask'].sum()}, "
          f"Test: {data['test_mask'].sum()}")

    # Loop-invariant views of the dataset.
    x, y, adj_norm = data["x"], data["y"], data["adj_norm"]
    train_mask, val_mask, test_mask = (
        data["train_mask"], data["val_mask"], data["test_mask"]
    )

    model = MicroGCN(nfeat=data["n_features"], nclass=data["n_classes"])
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

    best_val_acc = 0.0
    final_loss = float("nan")  # stays NaN when epochs == 0
    for epoch in range(epochs):
        # One full-batch gradient step on the labeled training nodes.
        model.train()
        optimizer.zero_grad()
        out = model(x, adj_norm)
        loss = F.nll_loss(out[train_mask], y[train_mask])
        loss.backward()
        optimizer.step()
        final_loss = loss.item()

        # Validation accuracy with dropout disabled.
        model.eval()
        with torch.no_grad():
            val_acc = _masked_accuracy(model(x, adj_norm), y, val_mask)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
        if (epoch + 1) % 25 == 0:
            print(f" Epoch {epoch+1:3d}: loss={final_loss:.4f} val_acc={val_acc:.3f}")

    # Test accuracy of the weights after the last epoch.
    model.eval()
    with torch.no_grad():
        test_acc = _masked_accuracy(model(x, adj_norm), y, test_mask)

    return {
        "final_loss": final_loss,
        "best_val_acc": best_val_acc,
        "test_acc": test_acc,
        "n_params": sum(p.numel() for p in model.parameters()),
    }
if __name__ == "__main__":
    # Build a throwaway model only to report its size up front;
    # train_and_evaluate() constructs its own instance.
    probe = MicroGCN(nfeat=1433, nclass=7)
    n_params = 0
    for param in probe.parameters():
        n_params += param.numel()
    print(f"MicroGCN: {n_params:,} parameters")
    print()

    print("Training on Cora citation network...")
    summary = train_and_evaluate(epochs=200)

    # Final report of the metrics returned by the training run.
    print("\n✅ Results:")
    print(f" Test accuracy: {summary['test_acc']:.3f}")
    print(f" Best val accuracy: {summary['best_val_acc']:.3f}")
    print(f" Final loss: {summary['final_loss']:.4f}")
    print(f" Parameters: {summary['n_params']:,}")