mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-06 01:28:35 -05:00
Snapshot of the standalone /Users/VJ/GitHub/mlperf-edu/ repo as of 2026-04-16, brought into MLSysBook as a parked feature branch for backup and iteration. Not for merge to dev. Contents (88 files, ~2.3 MB): - 16 reference workloads (cloud / edge / tiny / agent divisions) - LoadGen proxy harness + SUT plugin protocol - Compliance checker, autograder, hardware fingerprint - Paper draft (paper.tex) with TikZ/SVG figure sources - Three lab examples + practitioner workflow configs - Workload + dataset YAML registries (single source of truth) Excluded (per mlperf-edu/.gitignore + size constraints): - Datasets (6.6 GB), checkpoints (260 MB), gpt2 weights (523 MB) - Generated PDFs, .venv, build artifacts
34 lines
1.1 KiB
Python
34 lines
1.1 KiB
Python
import torch
|
|
from mlperf_edu.core import Referee
|
|
|
|
def test_referee_catches_early_stop():
    """A student cannot stop the referee clock without reaching target accuracy.

    Submits predictions that are 100% wrong and verifies that the referee
    reports 0.0 accuracy, does not mark the run as done, and keeps the
    wall clock running (no end timestamp recorded).
    """
    referee = Referee("student_cheater", "vision-baseline", 0.90)
    referee.start_clock()

    # Student passes fake bad predictions: argmax is [0, 1] but the
    # targets are [1, 0], so every prediction is wrong (0% accuracy).
    preds = torch.tensor([[0.8, 0.2], [0.1, 0.9]])
    targets = torch.tensor([1, 0])  # 0% accuracy

    acc = referee.evaluate_epoch(preds, targets)

    assert acc == 0.0
    # Target accuracy (0.90) was not reached, so the run must not be done
    # and the clock must still be running.
    assert not referee.is_done()
    assert referee._end_time is None  # Clock is still running!
|
|
|
|
def test_referee_validates_success():
    """The referee generates a passing receipt when the target is hit.

    Submits predictions that are 100% correct and verifies that the referee
    reports 1.0 accuracy, marks the run as done, and records a result
    object with ``passed`` set.
    """
    referee = Referee("student_honest", "vision-baseline", 0.90)
    referee.start_clock()

    # Student passes 100% accurate predictions: argmax is [0, 1],
    # matching the targets exactly.
    preds = torch.tensor([[0.8, 0.2], [0.1, 0.9]])
    targets = torch.tensor([0, 1])

    acc = referee.evaluate_epoch(preds, targets)

    assert acc == 1.0
    # Target accuracy (0.90) was exceeded: the run is done and a
    # passing result/receipt must have been recorded.
    assert referee.is_done()
    assert referee._result is not None
    assert referee._result.passed
|