Snapshot of the standalone /Users/VJ/GitHub/mlperf-edu/ repo as of 2026-04-16, brought into MLSysBook as a parked feature branch for backup and iteration. Not for merge to dev.

Contents (88 files, ~2.3 MB):
- 16 reference workloads (cloud / edge / tiny / agent divisions)
- LoadGen proxy harness + SUT plugin protocol
- Compliance checker, autograder, hardware fingerprint
- Paper draft (paper.tex) with TikZ/SVG figure sources
- Three lab examples + practitioner workflow configs
- Workload + dataset YAML registries (single source of truth)

Excluded (per mlperf-edu/.gitignore + size constraints):
- Datasets (6.6 GB), checkpoints (260 MB), gpt2 weights (523 MB)
- Generated PDFs, .venv, build artifacts
54 lines
1.9 KiB
Python
import torch
import torch.nn as nn
from rich.console import Console

console = Console()


class PedagogicalMobileBERT(nn.Module):
    def __init__(self, vocab_size=30522, d_model=128, inter_model=512):
        super().__init__()
        # MobileBERT narrows BERT-base's 768-dim hidden size to a 128-dim
        # bottleneck; the small d_model below mirrors that compression.
        self.embedding = nn.Embedding(vocab_size, d_model)

        # These nn.Linear layers are the targets for PyTorch INT8 Dynamic
        # Quantization, which swaps them for quantized equivalents.
        self.encoder = nn.Linear(d_model, inter_model)
        self.activation = nn.ReLU()
        self.decoder = nn.Linear(inter_model, d_model)

        # Two logits per token: answer-span start and end, as in extractive QA.
        self.qa_outputs = nn.Linear(d_model, 2)

    def forward(self, input_ids):
        x = self.embedding(input_ids)
        x = self.encoder(x)
        x = self.activation(x)
        x = self.decoder(x)
        return self.qa_outputs(x)


def run_benchmark(provd_path: str, scenario: str):
    """
    Map the MobileBERT workload onto a mobile INT8 dynamic-quantization
    execution path and run a quantized forward pass. (`provd_path` and
    `scenario` are unused in this minimal example.)
    """
    console.print("[Mobile:Infer] 🤳 Instantiating PyTorch MobileBERT from provenance...")

    model = PedagogicalMobileBERT()
    model.eval()

    # Ensure cross-platform execution: ARM machines (e.g. M1 Macs) use the
    # qnnpack backend, while x86 uses fbgemm.
    if 'qnnpack' in torch.backends.quantized.supported_engines:
        torch.backends.quantized.engine = 'qnnpack'

    # 💥 Dynamic INT8 Quantization Step (Pedagogical Core!)
    # Weights of every nn.Linear are converted to INT8 ahead of time;
    # activations are quantized on the fly at each forward call.
    quantized_model = torch.ao.quantization.quantize_dynamic(
        model, {nn.Linear}, dtype=torch.qint8
    )
    console.print("[Mobile:Infer] ⚡ MobileBERT dynamically quantized to INT8.")

    batch_size = 1
    seq_length = 64
    input_ids = torch.randint(0, 30522, (batch_size, seq_length))

    with torch.no_grad():
        out = quantized_model(input_ids)

    console.print(f"[Mobile:Infer] ✅ Quantized forward pass successful. Logits Shape: {out.shape}")
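
    # A minimal latency sketch, added here because the docstring frames this as
    # a latency benchmark while the original file stops at one forward pass.
    # `n_iters` and the perf_counter timer are assumptions, not project choices.
    import time

    n_iters = 50
    with torch.no_grad():
        start = time.perf_counter()
        for _ in range(n_iters):
            quantized_model(input_ids)
        mean_ms = (time.perf_counter() - start) / n_iters * 1e3
    console.print(f"[Mobile:Infer] ⏱️ Mean INT8 latency: {mean_ms:.2f} ms over {n_iters} runs")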