mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-07 18:18:42 -05:00
Snapshot of the standalone /Users/VJ/GitHub/mlperf-edu/ repo as of 2026-04-16, brought into MLSysBook as a parked feature branch for backup and iteration. Not for merge to dev. Contents (88 files, ~2.3 MB): - 16 reference workloads (cloud / edge / tiny / agent divisions) - LoadGen proxy harness + SUT plugin protocol - Compliance checker, autograder, hardware fingerprint - Paper draft (paper.tex) with TikZ/SVG figure sources - Three lab examples + practitioner workflow configs - Workload + dataset YAML registries (single source of truth) Excluded (per mlperf-edu/.gitignore + size constraints): - Datasets (6.6 GB), checkpoints (260 MB), gpt2 weights (523 MB) - Generated PDFs, .venv, build artifacts
78 lines
2.7 KiB
YAML
78 lines
2.7 KiB
YAML
# The Universal Dataset Pedagogy Matrix
#
# Registry of datasets, keyed by dataset name. Every entry carries the same
# four fields:
#   description        human-readable summary of the dataset's role
#   uri                where the data comes from (URL, torchvision:// handle,
#                      or synthetic_generator: handle)
#   estimated_size_mb  approximate download / on-disk size in megabytes
#   split              which split of the dataset is used
#
# Goal: give every student the same isolated dataset configuration without
# filling local hard drives with full canonical dataset drops (up to ~500 GB).

datasets:
  wikitext:
    description: "Language Modeling Token Set"
    uri: "https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt"
    estimated_size_mb: 12.0
    split: "train"

  criteo-1tb:
    description: "Pedagogical Sparse Click-Through Subset"
    # A synthetic generator stands in for the real 1 TB download so students
    # can still study embedding scale.
    uri: "synthetic_generator:sparse_criteo_1M"
    estimated_size_mb: 50.0
    split: "synthetic"

  tinyshakespeare:
    description: "Closed-Loop dense semantic grammar bounding Micro LLMs analytically"
    uri: "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    estimated_size_mb: 1.0
    split: "train"

  cifar10:
    description: "Native Vision Generator Bounds (Micro-Diffusion Targets natively)"
    uri: "torchvision://cifar10"
    estimated_size_mb: 161.0
    split: "train"

  cifar100:
    description: "Native Vision Scaling"
    uri: "torchvision://cifar100"
    estimated_size_mb: 161.0
    split: "train/test"

  squad:
    description: "Stanford Question Answering Dataset v1.1"
    uri: "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
    estimated_size_mb: 4.8
    split: "dev"

  ms-marco-v2:
    description: "Retrieval Vector Architecture Passages"
    uri: "https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz"
    estimated_size_mb: 1000.0  # Caution for laptops
    split: "dev"

  coco:
    description: "MS-COCO Bounding Box Validator (Mobile Object Detection bounds)"
    uri: "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
    estimated_size_mb: 241.0
    split: "val"

  # Same source file as `squad`; only the split label differs.
  squad_mini:
    description: "INT8 NLP Verification Track Limit"
    uri: "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
    estimated_size_mb: 4.8
    split: "dev_sampled"

  openassistant:
    description: "Chatbot Conversational Prompts (4-Bit AWQ emulation)"
    uri: "https://huggingface.co/datasets/OpenAssistant/oasst1"
    estimated_size_mb: 35.0
    split: "validation"

  speech_commands:
    description: "Audio Keyword Spotting (Micro-Memory Scale)"
    uri: "http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz"
    estimated_size_mb: 2400.0
    split: "test"

  toyadmos:
    description: "Acoustic Anomaly Detection Engine"
    uri: "https://zenodo.org/record/3239851/files/ToyADMOS.zip"
    estimated_size_mb: 1500.0
    split: "test"