"""
|
||
MLPerf EDU: Micro-GCN — Graph Neural Network Workload
|
||
======================================================
|
||
Provenance: Kipf & Welling 2017, "Semi-Supervised Classification with
|
||
Graph Convolutional Networks"
|
||
Maps to: MLPerf Training GNN (Graph Neural Networks division)
|
||
|
||
This implements a minimal Graph Convolutional Network (GCN) for node
|
||
classification on the **Cora citation network** — a real, widely-cited
|
||
benchmark dataset (2,708 nodes, 5,429 edges, 7 classes).
|
||
|
||
Dataset: Cora (McCallum et al., 2000; Kipf & Welling 2017)
|
||
- 2,708 scientific papers (nodes)
|
||
- 5,429 citation links (edges)
|
||
- 1,433 binary bag-of-words features per paper
|
||
- 7 subject classes: Case_Based, Genetic_Algorithms, Neural_Networks,
|
||
Probabilistic_Methods, Reinforcement_Learning, Rule_Learning, Theory
|
||
- Standard split: 140 train / 500 val / 1000 test (semi-supervised)
|
||
- Ships locally in data/cora/ (168KB compressed)
|
||
|
||
Pedagogical concepts:
|
||
- Message passing: each node aggregates features from neighbors
|
||
- Graph convolution: spectral approximation via Chebyshev polynomials
|
||
- Over-smoothing: why deeper GNNs lose discriminative power
|
||
- Adjacency normalization: D^{-1/2} A D^{-1/2}
|
||
- Semi-supervised learning: only 20 labels per class (140 total)
|
||
|
||
Architecture:
|
||
GCNConv(1433, 64) → ReLU → Dropout
|
||
→ GCNConv(64, 32) → ReLU → Dropout
|
||
→ Linear(32, 7)
|
||
|
||
Total: ~94K parameters
|
||
Target: Test accuracy > 0.78 (production GCN achieves ~81%)
|
||
"""
|
||
|
||
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
CORA_DIR = os.path.join(REPO_ROOT, "data", "cora", "cora")


# ============================================================================
# Graph Convolution Layer — Pure PyTorch, no PyG / DGL dependency
# ============================================================================

class GCNConv(nn.Module):
    """
    Graph Convolutional Layer (Kipf & Welling 2017).

    Computes: H' = σ(D̃^{-1/2} Ã D̃^{-1/2} H W)
    where Ã = A + I (self-loops), D̃ = degree matrix of Ã

    This is equivalent to a 1-hop neighborhood aggregation:
    each node's new features = learned linear combination of its
    neighbors' features (including itself).

    Args:
        in_features: Input feature dimension
        out_features: Output feature dimension
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        self.weight = nn.Parameter(torch.empty(in_features, out_features))
        self.bias = nn.Parameter(torch.zeros(out_features))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, x, adj_norm):
        """
        Args:
            x: Node features (N, in_features)
            adj_norm: Normalized adjacency matrix D̃^{-1/2} Ã D̃^{-1/2},
                sparse COO (N, N)
        Returns:
            Updated node features (N, out_features)
        """
        # Step 1: Linear transform of node features
        support = torch.mm(x, self.weight)      # (N, out_features)
        # Step 2: Aggregate from neighbors via the normalized adjacency
        # (torch.spmm requires a sparse first operand)
        output = torch.spmm(adj_norm, support)  # (N, out_features)
        return output + self.bias
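

# Illustrative sketch (not part of the benchmark harness): one GCNConv pass
# over a toy 3-node path graph, showing how the normalized adjacency drives
# message passing. The helper name `_demo_gcnconv` is ours, purely for
# exposition.
def _demo_gcnconv():
    A = torch.tensor([[0., 1., 0.],
                      [1., 0., 1.],
                      [0., 1., 0.]])          # path graph 0-1-2
    A_tilde = A + torch.eye(3)                # Ã = A + I (self-loops)
    d_inv_sqrt = A_tilde.sum(dim=1).pow(-0.5)
    adj_norm = d_inv_sqrt[:, None] * A_tilde * d_inv_sqrt[None, :]
    layer = GCNConv(in_features=4, out_features=2)
    x = torch.randn(3, 4)                     # 3 nodes, 4 features each
    out = layer(x, adj_norm.to_sparse())      # (3, 2)
    return out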


class MicroGCN(nn.Module):
    """
    2-layer GCN for node classification on the Cora dataset.

    Architecture:
        Input features → GCNConv(hidden) → ReLU → Dropout
        → GCNConv(hidden2) → ReLU → Dropout
        → Linear(num_classes)

    Args:
        nfeat: Number of input features per node (1433 for Cora)
        nhid: Hidden dimension (default 64)
        nclass: Number of output classes (7 for Cora)
        dropout: Dropout rate

    Exercise: Over-smoothing Analysis
        Try adding more GCN layers (3, 4, 5) and measure test accuracy
        (see the DeepGCN sketch below). You should observe accuracy
        degradation because GCNs suffer from "over-smoothing": after too
        many message-passing rounds, all node representations converge to
        the same vector (Laplacian smoothing). This is a fundamental
        limitation of deep message passing and motivates architectures
        like GAT and GraphSAGE, or techniques like residual connections
        and jumping knowledge.
    """

    def __init__(self, nfeat=1433, nhid=64, nclass=7, dropout=0.5):
        super().__init__()
        self.gc1 = GCNConv(nfeat, nhid)
        self.gc2 = GCNConv(nhid, nhid // 2)
        self.classifier = nn.Linear(nhid // 2, nclass)
        self.dropout = dropout

    def forward(self, x, adj_norm):
        """
        Args:
            x: Node features (N, nfeat)
            adj_norm: Normalized adjacency, sparse COO (N, N)
        Returns:
            log_softmax over classes (N, nclass)
        """
        x = F.relu(self.gc1(x, adj_norm))
        x = F.dropout(x, self.dropout, training=self.training)
        x = F.relu(self.gc2(x, adj_norm))
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.classifier(x)
        return F.log_softmax(x, dim=1)
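

# Sketch for the over-smoothing exercise in MicroGCN's docstring: a
# variable-depth GCN (our hypothetical `DeepGCN`, not part of the reference
# workload). Train it with n_layers in {2, 3, 4, 5} and watch test accuracy
# fall as node representations smooth out.
class DeepGCN(nn.Module):
    def __init__(self, nfeat=1433, nhid=64, nclass=7, n_layers=4, dropout=0.5):
        super().__init__()
        dims = [nfeat] + [nhid] * n_layers
        self.convs = nn.ModuleList(
            [GCNConv(dims[i], dims[i + 1]) for i in range(n_layers)]
        )
        self.classifier = nn.Linear(nhid, nclass)
        self.dropout = dropout

    def forward(self, x, adj_norm):
        for conv in self.convs:
            x = F.relu(conv(x, adj_norm))
            x = F.dropout(x, self.dropout, training=self.training)
        return F.log_softmax(self.classifier(x), dim=1)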


# ============================================================================
# Cora Dataset Loader — real citation network, shipped locally
# ============================================================================

CLASS_NAMES = [
    "Case_Based", "Genetic_Algorithms", "Neural_Networks",
    "Probabilistic_Methods", "Reinforcement_Learning",
    "Rule_Learning", "Theory"
]

def load_cora(data_dir=None, seed=42):
    """
    Load the Cora citation network from local files.

    Data format:
        cora.content: <paper_id> <1433 binary features> <class_label>
        cora.cites:   <cited_paper_id> <citing_paper_id>

    Args:
        data_dir: Directory holding cora.content / cora.cites
            (defaults to CORA_DIR)
        seed: RNG seed for the train/val/test node selection

    Returns:
        dict with keys: x, y, adj_norm, train_mask, val_mask, test_mask,
        n_classes, n_features, class_names, n_nodes, n_edges
    """
    if data_dir is None:
        data_dir = CORA_DIR

    content_path = os.path.join(data_dir, "cora.content")
    cites_path = os.path.join(data_dir, "cora.cites")

    if not os.path.exists(content_path):
        raise FileNotFoundError(
            f"Cora dataset not found at {data_dir}. "
            "Download: cd data/cora && curl -sL "
            "'https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz' | tar xz"
        )

    # ── Parse nodes ──────────────────────────────────────────────────────
    # Each line: paper_id feat_1 feat_2 ... feat_1433 class_label
    paper_ids = []
    features_list = []
    labels_list = []

    label_to_idx = {name: i for i, name in enumerate(CLASS_NAMES)}

    with open(content_path, "r") as f:
        for line in f:
            parts = line.strip().split("\t")
            paper_id = int(parts[0])
            feats = list(map(float, parts[1:-1]))
            label_str = parts[-1]

            paper_ids.append(paper_id)
            features_list.append(feats)
            labels_list.append(label_to_idx[label_str])

    n_nodes = len(paper_ids)
    id_to_idx = {pid: i for i, pid in enumerate(paper_ids)}

    features = torch.tensor(features_list, dtype=torch.float32)  # (2708, 1433)
    labels = torch.tensor(labels_list, dtype=torch.long)         # (2708,)

    # ── Parse edges ──────────────────────────────────────────────────────
    edges_src, edges_dst = [], []
    with open(cites_path, "r") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) != 2:
                continue
            cited = int(parts[0])
            citing = int(parts[1])
            if cited in id_to_idx and citing in id_to_idx:
                src = id_to_idx[cited]
                dst = id_to_idx[citing]
                # Treat citations as undirected: add both directions
                edges_src.extend([src, dst])
                edges_dst.extend([dst, src])

    # ── Build adjacency matrix with self-loops ───────────────────────────
    n = n_nodes
    indices = torch.tensor([edges_src, edges_dst], dtype=torch.long)
    values = torch.ones(len(edges_src), dtype=torch.float32)
    adj = torch.sparse_coo_tensor(indices, values, size=(n, n))

    # Add self-loops (Ã = A + I)
    self_loop = torch.sparse_coo_tensor(
        torch.arange(n).unsqueeze(0).repeat(2, 1),
        torch.ones(n), size=(n, n)
    )
    adj = (adj + self_loop).coalesce()

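    # Note on the densify-then-normalize step below: for Cora (n = 2,708) a
    # dense (n, n) float32 matrix is only ~29 MB, so clarity wins over
    # sparsity. A fully-sparse alternative (a sketch, not used by this
    # workload) would normalize the COO values directly:
    #
    #     idx, vals = adj.indices(), adj.values().clamp(max=1.0)
    #     deg = torch.zeros(n).index_add_(0, idx[0], vals)
    #     d_inv_sqrt = deg.pow(-0.5)
    #     norm_vals = d_inv_sqrt[idx[0]] * vals * d_inv_sqrt[idx[1]]
    #     adj_norm_sparse = torch.sparse_coo_tensor(idx, norm_vals, (n, n))
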
    # ── Normalize: D̃^{-1/2} Ã D̃^{-1/2} ─────────────────────────────────
    adj_dense = adj.to_dense()
    # Clamp to binary (the .cites file lists some edges twice)
    adj_dense = adj_dense.clamp(max=1.0)
    degree = adj_dense.sum(dim=1)
    d_inv_sqrt = torch.pow(degree, -0.5)
    d_inv_sqrt[torch.isinf(d_inv_sqrt)] = 0.0  # defensive; self-loops make degree ≥ 1
    D = torch.diag(d_inv_sqrt)
    adj_norm = D @ adj_dense @ D
    adj_norm_sparse = adj_norm.to_sparse()

    # ── Train/val/test split (Kipf & Welling sizes) ──────────────────────
    # 20 labeled nodes per class for training = 140 total (semi-supervised),
    # 500 for validation, 1000 for test. Node selection is a seeded random
    # approximation of the fixed Planetoid split used in the paper.
    rng = np.random.RandomState(seed)

    train_mask = torch.zeros(n_nodes, dtype=torch.bool)
    val_mask = torch.zeros(n_nodes, dtype=torch.bool)
    test_mask = torch.zeros(n_nodes, dtype=torch.bool)

    # Select 20 labeled nodes per class for training
    for c in range(len(CLASS_NAMES)):
        class_indices = (labels == c).nonzero(as_tuple=True)[0].numpy()
        rng.shuffle(class_indices)
        train_mask[class_indices[:20]] = True

    # Remaining nodes: 500 val, 1000 test
    remaining = (~train_mask).nonzero(as_tuple=True)[0].numpy()
    rng.shuffle(remaining)
    val_mask[remaining[:500]] = True
    test_mask[remaining[500:1500]] = True

    return {
        "x": features,
        "y": labels,
        "adj_norm": adj_norm_sparse,
        "train_mask": train_mask,
        "val_mask": val_mask,
        "test_mask": test_mask,
        "n_classes": len(CLASS_NAMES),
        "n_features": features.shape[1],
        "class_names": CLASS_NAMES,
        "n_nodes": n_nodes,
        "n_edges": len(edges_src) // 2,
    }


def get_gnn_dataloaders(seed=42, **kwargs):
    """
    Returns the Cora graph data dict (GNNs don't use standard DataLoaders).
    Compatible with the dataset_factory interface.
    """
    return load_cora(seed=seed)
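
# Example usage of the factory interface (a sketch; assumes the Cora files
# are present under data/cora/cora as described in load_cora):
#
#     data = get_gnn_dataloaders(seed=0)
#     model = MicroGCN(nfeat=data["n_features"], nclass=data["n_classes"])
#     log_probs = model(data["x"], data["adj_norm"])   # (2708, 7)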


# ============================================================================
# Training loop for standalone testing
# ============================================================================

def train_and_evaluate(epochs=200, lr=0.01, seed=42):
    """Full-graph training loop on Cora — targets ~78-81% test accuracy."""
    torch.manual_seed(seed)  # make weight init and dropout reproducible
    data = load_cora(seed=seed)

    print(f"  Cora dataset: {data['n_nodes']} nodes, {data['n_edges']} edges")
    print(f"  Features: {data['n_features']}, Classes: {data['n_classes']}")
    print(f"  Train: {data['train_mask'].sum()}, Val: {data['val_mask'].sum()}, "
          f"Test: {data['test_mask'].sum()}")

    model = MicroGCN(nfeat=data["n_features"], nclass=data["n_classes"])
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-4)

    best_val_acc = 0.0
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data["x"], data["adj_norm"])
        loss = F.nll_loss(out[data["train_mask"]], data["y"][data["train_mask"]])
        loss.backward()
        optimizer.step()

        # Evaluate on the validation split
        model.eval()
        with torch.no_grad():
            out = model(data["x"], data["adj_norm"])
            val_pred = out[data["val_mask"]].argmax(dim=1)
            val_acc = (val_pred == data["y"][data["val_mask"]]).float().mean().item()

        if val_acc > best_val_acc:
            best_val_acc = val_acc

        if (epoch + 1) % 25 == 0:
            print(f"  Epoch {epoch+1:3d}: loss={loss.item():.4f} val_acc={val_acc:.3f}")

    # Test
    model.eval()
    with torch.no_grad():
        out = model(data["x"], data["adj_norm"])
        test_pred = out[data["test_mask"]].argmax(dim=1)
        test_acc = (test_pred == data["y"][data["test_mask"]]).float().mean().item()

    return {
        "final_loss": loss.item(),
        "best_val_acc": best_val_acc,
        "test_acc": test_acc,
        "n_params": sum(p.numel() for p in model.parameters()),
    }


if __name__ == "__main__":
    model = MicroGCN(nfeat=1433, nclass=7)
    n_params = sum(p.numel() for p in model.parameters())
    print(f"MicroGCN: {n_params:,} parameters")
    print()
    print("Training on Cora citation network...")
    results = train_and_evaluate(epochs=200)
    print("\n✅ Results:")
    print(f"  Test accuracy:     {results['test_acc']:.3f}")
    print(f"  Best val accuracy: {results['best_val_acc']:.3f}")
    print(f"  Final loss:        {results['final_loss']:.4f}")
    print(f"  Parameters:        {results['n_params']:,}")