mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-06 09:38:33 -05:00
Snapshot of the standalone /Users/VJ/GitHub/mlperf-edu/ repo as of 2026-04-16, brought into MLSysBook as a parked feature branch for backup and iteration. Not for merge to dev.

Contents (88 files, ~2.3 MB):
- 16 reference workloads (cloud / edge / tiny / agent divisions)
- LoadGen proxy harness + SUT plugin protocol
- Compliance checker, autograder, hardware fingerprint
- Paper draft (paper.tex) with TikZ/SVG figure sources
- Three lab examples + practitioner workflow configs
- Workload + dataset YAML registries (single source of truth)

Excluded (per mlperf-edu/.gitignore + size constraints):
- Datasets (6.6 GB), checkpoints (260 MB), gpt2 weights (523 MB)
- Generated PDFs, .venv, build artifacts
28 lines
764 B
Bash
28 lines
764 B
Bash
#!/bin/bash
# setup_micro_datasets.sh
# Bootstraps the MLPerf EDU environment by ensuring real-data shards are present.
#
# For every name in SHARDS, downloads "$BASE_URL/<name>.npz" to
# "$DATA_DIR/<name>.npz" unless a regular file already exists at that path.
# Exits non-zero on the first shard that cannot be downloaded.
set -euo pipefail

DATA_DIR="$HOME/.mlperf_edu/data"
BASE_URL="https://raw.githubusercontent.com/MLSysBook/mlperf-edu-data/main/shards"

# Shard list
SHARDS=("cifar10_micro" "speech_commands_micro")

# Path of the in-flight download, if any. It is removed on every exit path so
# that a failed or interrupted transfer never leaves a partial file behind.
PARTIAL_FILE=""
cleanup() {
  if [[ -n "$PARTIAL_FILE" ]]; then
    rm -f -- "$PARTIAL_FILE"
  fi
}
trap cleanup EXIT

mkdir -p "$DATA_DIR"

echo "🛠️ Initializing MLPerf EDU Real-Data Shards..."

for SHARD in "${SHARDS[@]}"; do
  FILE="$DATA_DIR/$SHARD.npz"
  if [[ ! -f "$FILE" ]]; then
    echo "📥 Fetching $SHARD..."
    # Download next to the destination and rename only after success: the
    # "already exists" check above must never see a truncated shard.
    PARTIAL_FILE="$FILE.part"
    # --fail makes HTTP errors (404, 500, ...) return a non-zero status instead
    # of saving the server's error page as the shard; --show-error keeps the
    # failure reason visible even though progress output is silenced.
    if ! curl --fail --silent --show-error --location \
      "$BASE_URL/$SHARD.npz" -o "$PARTIAL_FILE"; then
      echo "❌ Failed to download shard $SHARD from $BASE_URL/$SHARD.npz" >&2
      exit 1
    fi
    mv -- "$PARTIAL_FILE" "$FILE"
    PARTIAL_FILE=""
    echo "✅ Shard $SHARD ready."
  else
    echo "✅ Shard $SHARD already exists in $DATA_DIR."
  fi
done

echo "🚀 All educational shards are ready for training!"