# Mirror of https://github.com/harvard-edge/cs249r_book.git
# Synced 2026-05-07 02:03:55 -05:00
# 841 lines, 36 KiB, YAML
---
# Benchmark workload suite definitions, grouped by deployment tier
# (cloud / edge / tiny / agent). Each workload records its model, dataset,
# quality target, verified baseline, and a three-axis "regime" classification
# (working_set / arithmetic_intensity / dispatch) with roofline evidence
# sidecars where measured.
suites:
  cloud:
    nanogpt-train:
      model: nanogpt-12m
      params: 11.1M
      params_note: Char-level vocab=128 (TinyShakespeare uses 65 unique chars). Earlier README/paper figures of 85.9M and
        124.4M reflected a vocab=50,257 BPE config that was never reachable from the char-level data; iteration 1 reconciled
        the model to its actual effective size.
      dataset: tinyshakespeare
      dataset_source: Karpathy (2015), character-level, 1.1MB, shipped in repo
      quality_target:
        metric: cross_entropy_loss
        value: 2.3
      verified_baseline:
        train_loss: 2.248
        val_loss: 2.205
        epochs: 25
        time_seconds: 89
        baseline_note: Loss curves carry over from the prior 30.3M-with-wasted-vocab config because only the 65 active token
          embeddings ever updated; needs re-verification on the cleaned 11M model in a follow-up iteration.
      scenario: single_stream
      provenance: Vaswani et al. 2017 (Transformer); maps to MLPerf Training GPT-3/LLaMA
      regime:
        working_set:
          value: unmeasured
          peak_bytes_per_step: 67108864
          note: "Per-layer activation peak ~12 MB at B=16 T=64; weights ~46 MB streamed once per epoch. Falls in the 6-48 MB grey band; exact classification depends on batch and depends on caching behavior \u2014 measure with cache-miss counters."
        arithmetic_intensity:
          value: compute_bound
          flops_per_byte: 121.747
          classification_rule: intensity 121.75 vs [low 13.71, high 54.84]; util 0.222 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/nanogpt-train_2026-04-16T21-02-31Z_17c2a18fb5ec.json
          evidence_sha256_short: 2be2e6231554
          measured_at: 2026-04-16T21-02-31Z
          platform_machine_class: apple-silicon
        dispatch:
          value: dispatch_bound
          note: Training-loop dispatch needs a roofline-emitter run; logged for iter 5.
          utilization: 0.2225
          achieved_bw_gbps: 26.419
          classification_rule: intensity 121.75 vs [low 13.71, high 54.84]; util 0.222 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/nanogpt-train_2026-04-16T21-02-31Z_17c2a18fb5ec.json
          evidence_sha256_short: 2be2e6231554
          measured_at: 2026-04-16T21-02-31Z
          platform_machine_class: apple-silicon

    nano-moe-train:
      model: nano-moe-12m
      params: 17.4M
      dataset: tinyshakespeare
      dataset_source: Karpathy (2015), character-level, 1.1MB, shipped in repo
      quality_target:
        metric: cross_entropy_loss
        value: 0.05
      verified_baseline:
        train_loss: 0.042
        val_loss: 0.042
        epochs: 25
        time_seconds: 158
      scenario: single_stream
      provenance: Shazeer et al. 2017 (MoE); 8 experts, top-2 routing
      regime:
        working_set:
          value: unmeasured
          peak_bytes_per_step: 20971520
          note: "17M params, top-2 routing. ~21 MB per-step is in the 6-48 MB grey band; conditional compute pattern complicates cache modeling \u2014 measure."
        arithmetic_intensity:
          value: compute_bound
          flops_per_byte: 123.795
          classification_rule: intensity 123.79 vs [low 13.71, high 54.84]; util 0.012 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/nano-moe-train_2026-04-16T21-03-07Z_f228368b83bd.json
          evidence_sha256_short: b87deba47fc8
          measured_at: 2026-04-16T21-03-07Z
          platform_machine_class: apple-silicon
        dispatch:
          value: dispatch_bound
          note: Routing/gating may push to dispatch_bound at small batch; needs measurement.
          utilization: 0.0123
          achieved_bw_gbps: 1.435
          classification_rule: intensity 123.79 vs [low 13.71, high 54.84]; util 0.012 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/nano-moe-train_2026-04-16T21-03-07Z_f228368b83bd.json
          evidence_sha256_short: b87deba47fc8
          measured_at: 2026-04-16T21-03-07Z
          platform_machine_class: apple-silicon

    micro-dlrm-train:
      model: micro-dlrm-1m
      params: 23K
      dataset: movielens-100k
      dataset_source: "Harper & Konstan (2015), MovieLens-100K, 100K ratings from 943 users \xD7 1682 movies; ships in data/movielens/"
      quality_target:
        metric: accuracy
        value: 0.7
      verified_baseline:
        train_loss: 0.58
        val_loss: 0.61
        accuracy: 0.71
        epochs: 25
        time_seconds: 5
        note: "23K params; embedding tables (943\xD78 + 1682\xD78 \u2248 21KB) fit in L1 cache"
      scenario: server
      provenance: Naumov et al. 2019 (DLRM); maps to MLPerf Training DLRM
      regime:
        working_set:
          value: cache_resident
          peak_bytes_per_step: 21504
          note: 943x8 + 1682x8 + 21x8 ~ 21 KB embedding tables fit in L1 trivially.
        arithmetic_intensity:
          value: compute_bound
          flops_per_byte: 179.508
          classification_rule: intensity 179.51 vs [low 13.71, high 54.84]; util 0.004 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/micro-dlrm-train_2026-04-16T21-03-36Z_12d9524eac39.json
          evidence_sha256_short: c0d3b509c627
          measured_at: 2026-04-16T21-03-36Z
          platform_machine_class: apple-silicon
        dispatch:
          value: dispatch_bound
          utilization: 0.0043
          observation_source: iter-2 smoke_dlrm_dram.py probe; cache variant is dispatch-bound at any realistic batch size
          achieved_bw_gbps: 0.349
          classification_rule: intensity 179.51 vs [low 13.71, high 54.84]; util 0.004 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/micro-dlrm-train_2026-04-16T21-03-36Z_12d9524eac39.json
          evidence_sha256_short: c0d3b509c627
          measured_at: 2026-04-16T21-03-36Z
          platform_machine_class: apple-silicon

    micro-dlrm-dram-train:
      model: micro-dlrm-dram-1m
      params: 512M
      trainable_params: 23K
      virtual_table_bytes: 2147483648
      llc_capacity_factor: 170
      dataset: movielens-100k
      dataset_source: "Harper & Konstan (2015) \u2014 same dataset as micro-dlrm-train"
      quality_target:
        metric: accuracy
        value: 0.65
      verified_baseline:
        baseline_note: Awaiting first verification run on iter-2 harness.
      scenario: server
      provenance: Naumov et al. 2019 (DLRM) + Weinberger et al. 2009 (hashing trick); production-realistic memory access pattern
        at laptop scale
      regime:
        working_set:
          value: dram_bound
          peak_bytes_per_step: 67108864
          note: 2M-row x 256-dim virtual table = 2 GB total; per-step working set ~64 MB at B=8192 with random hashing
        arithmetic_intensity:
          value: unmeasured
          flops_per_byte: 14.919
          classification_rule: intensity 14.92 vs [low 13.71, high 54.84]; util 0.019 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/micro-dlrm-dram-train_2026-04-16T21-04-03Z_a229b977a809.json
          evidence_sha256_short: 0f986b0977e6
          measured_at: 2026-04-16T21-04-03Z
          platform_machine_class: apple-silicon
        dispatch:
          value: dispatch_bound
          utilization: 0.0188
          achieved_bw_gbps: 9.933
          observation_source: 'iter-2 smoke_dlrm_dram.py: m_spa=256 explicitly chosen to clear PyTorch''s ~50us dispatch floor'
          classification_rule: intensity 14.92 vs [low 13.71, high 54.84]; util 0.019 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/micro-dlrm-dram-train_2026-04-16T21-04-03Z_a229b977a809.json
          evidence_sha256_short: 0f986b0977e6
          measured_at: 2026-04-16T21-04-03Z
          platform_machine_class: apple-silicon

    nano-lora-finetune:
      model: nanogpt-small-86m + LoRA(rank=8, alpha=16, target=c_attn)
      base_params: 88.3M
      trainable_params: 294912
      trainable_ratio_pct: 0.334
      scenario: training
      provenance: "Iter-7 (Han): LoRA fine-tuning workload demonstrating PEFT.
        Random-init backbone (random training data) — measures the SYSTEMS cost
        of LoRA, not task accuracy. Base parameters genuinely frozen
        (smoke verifies base_grad_norm == 0)."
      regime:
        working_set: { value: dram_bound }
        arithmetic_intensity: { value: unmeasured }
        dispatch: { value: unmeasured }

    mobilenet-cifar100-composed-fp16:
      model: MobileNetV2-cifar100 + 2:4 sparsity + fake-INT8 + fp16
      params: 2.4M
      effective_compression_ratio: 5.9
      scenario: inference
      provenance: "Iter-8 (Han): composition of three compression techniques on
        MobileNetV2. Algorithmic 5.9x bytes reduction; PyTorch+MPS runtime
        speedup only 1.06x (no INT8/2:4 kernels). The gap is the lesson."
      regime:
        working_set: { value: cache_resident }
        arithmetic_intensity: { value: unmeasured }
        dispatch: { value: unmeasured }

    micro-dlrm-distributed:
      model: micro-dlrm-1m via torch.distributed (Gloo, 2 ranks, localhost)
      params: 1.0M
      world_size: 2
      backend: gloo
      transport: loopback
      scenario: training
      provenance: "Iter-10 (Dean): two-process DDP smoke. Demonstrates DDP-vs-
        gradient-accumulation loss equivalence (delta 0.0064 < 0.02 gate).
        AllReduce overhead 0.6 ms/step on M5 Max loopback Gloo."
      regime:
        working_set: { value: cache_resident }
        arithmetic_intensity: { value: unmeasured }
        dispatch: { value: dispatch_bound, note: "Per-step 0.6 ms AllReduce dominates micro-step compute" }

    nanogpt-decode-fp32-b16:
      model: nanogpt-small-86m
      shared_checkpoint: nanogpt-small-train
      params: 88.3M
      default_prefill_ctx: 1024
      default_decode_steps: 8
      default_batch_size: 16
      scenario: server
      provenance: 'Iter-6 (Han): GPT-2-Small geometry decode at fp32. Real-LLM stand-in used to populate the bandwidth-bound
        serving-regime cells. Random weights for roofline characterization; iter-6.5 will train on TinyShakespeare.'
      regime:
        working_set:
          value: dram_bound
          note: 337 MB weights + 32x1024x18KB = ~590 MB KV stream per step; >> M1 LLC.
        arithmetic_intensity:
          value: bandwidth_bound
          note: Pending sync from sidecar.
          flops_per_byte: 1.81
          classification_rule: intensity 1.81 vs [low 13.71, high 54.84]; util 0.119 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/nanogpt-decode-fp32-b16_2026-04-16T21-45-16Z_c3118df3a1ed.json
          evidence_sha256_short: 4ef99feebf5f
          measured_at: 2026-04-16T21-45-16Z
          platform_machine_class: apple-silicon
        dispatch:
          value: dispatch_bound
          note: Pending sync from sidecar.
          utilization: 0.1186
          achieved_bw_gbps: 62.512
          classification_rule: intensity 1.81 vs [low 13.71, high 54.84]; util 0.119 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/nanogpt-decode-fp32-b16_2026-04-16T21-45-16Z_c3118df3a1ed.json
          evidence_sha256_short: 4ef99feebf5f
          measured_at: 2026-04-16T21-45-16Z
          platform_machine_class: apple-silicon

    nanogpt-decode-fp16-b16:
      model: nanogpt-small-86m
      shared_checkpoint: nanogpt-small-train
      params: 88.3M
      dtype: float16
      default_prefill_ctx: 1024
      default_decode_steps: 8
      default_batch_size: 16
      scenario: server
      provenance: 'Iter-6 (Han): GPT-2-Small at fp16, the production default for LLM serving. Demonstrates the 1.2-1.5x speedup
        that fp16 buys on MPS without custom kernels. Working set halves vs fp32.'
      regime:
        working_set:
          value: dram_bound
          note: 169 MB weights + halved KV stream; still >> LLC.
        arithmetic_intensity:
          value: bandwidth_bound
          note: Pending sync from sidecar.
          flops_per_byte: 3.62
          classification_rule: intensity 3.62 vs [low 13.71, high 54.84]; util 0.011 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/nanogpt-decode-fp16-b16_2026-04-16T21-45-41Z_f6c6cbd5d028.json
          evidence_sha256_short: b20908e55384
          measured_at: 2026-04-16T21-45-41Z
          platform_machine_class: apple-silicon
        dispatch:
          value: dispatch_bound
          note: Pending sync from sidecar.
          utilization: 0.011
          achieved_bw_gbps: 5.818
          classification_rule: intensity 3.62 vs [low 13.71, high 54.84]; util 0.011 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/nanogpt-decode-fp16-b16_2026-04-16T21-45-41Z_f6c6cbd5d028.json
          evidence_sha256_short: b20908e55384
          measured_at: 2026-04-16T21-45-41Z
          platform_machine_class: apple-silicon

    nanogpt-decode-spec:
      model: nanogpt-small-86m
      params: 88.3M
      draft_params: 11.1M
      gamma: 4
      default_prefill_ctx: 1024
      default_decode_tokens: 16
      scenario: server
      provenance: 'Iter-6 (Han): speculative decoding with 11M draft + 88M target, gamma=4, lossless argmax verify. Pedagogical
        demonstration that speculation only helps when draft and target agree often (random-init scaffold here gets 0% acceptance
        and is therefore SLOWER than baseline -- the lesson is that the draft must be trained to mimic the target distribution).'
      regime:
        working_set:
          value: dram_bound
        arithmetic_intensity:
          value: bandwidth_bound
          note: Pending sync from sidecar.
          flops_per_byte: 3.177
          classification_rule: intensity 3.18 vs [low 13.71, high 54.84]; util 0.006 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/nanogpt-decode-spec_2026-04-16T21-46-07Z_c434d4e51821.json
          evidence_sha256_short: b4eb96a293ba
          measured_at: 2026-04-16T21-46-07Z
          platform_machine_class: apple-silicon
        dispatch:
          value: dispatch_bound
          note: Pending sync from sidecar.
          utilization: 0.0059
          achieved_bw_gbps: 3.127
          classification_rule: intensity 3.18 vs [low 13.71, high 54.84]; util 0.006 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/nanogpt-decode-spec_2026-04-16T21-46-07Z_c434d4e51821.json
          evidence_sha256_short: b4eb96a293ba
          measured_at: 2026-04-16T21-46-07Z
          platform_machine_class: apple-silicon

    nanogpt-prefill:
      model: nanogpt-12m
      shared_checkpoint: nanogpt-train
      params: 11.5M
      default_context_len: 1792
      scenario: offline
      provenance: GPT-2 prefill regime; corresponds to MLPerf Inference 'prompt processing' phase
      regime:
        working_set:
          value: dram_bound
          peak_bytes_per_step: 96468992
          note: "Per-layer attention scores tensor (1792x1792 per head, 6 heads, fp32) is 19.3M floats = 77 MB alone \u2014 well past the 4*LLC = 48 MB threshold. The cited 96 MB also includes Q/K/V and FFN activations."
        arithmetic_intensity:
          value: compute_bound
          flops_per_byte: 289.277
          classification_rule: intensity 289.28 vs [low 13.71, high 54.84]; util 0.218 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/nanogpt-prefill_2026-04-16T19-29-32Z_319b7d0e1597.json
          evidence_sha256_short: 3f8a14d9d478
          measured_at: 2026-04-16T19-29-32Z
          platform_machine_class: apple-m1-16gb
        dispatch:
          value: dispatch_bound
          utilization: 0.2183
          observation_source: iter-3 smoke_nanogpt_phases.py; prefill latency 13ms over 1792 tokens
          achieved_bw_gbps: 10.913
          classification_rule: intensity 289.28 vs [low 13.71, high 54.84]; util 0.218 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/nanogpt-prefill_2026-04-16T19-29-32Z_319b7d0e1597.json
          evidence_sha256_short: 3f8a14d9d478
          measured_at: 2026-04-16T19-29-32Z
          platform_machine_class: apple-m1-16gb

    nanogpt-decode:
      model: nanogpt-12m
      shared_checkpoint: nanogpt-train
      params: 11.5M
      default_prefill_ctx: 1792
      default_decode_steps: 64
      kv_bytes_per_token: 18432
      scenario: server
      provenance: GPT-2 autoregressive decode; the regime that dominates LLM serving cost in production (vLLM, TensorRT-LLM,
        TGI all built around this)
      regime:
        working_set:
          value: unmeasured
          peak_bytes_per_step: 34603008
          note: 32 MB KV cache stream per step lands in the 6-48 MB grey band. Empirically the achieved BW (4 GB/s vs 68 GB/s
            peak) suggests DRAM streaming dominates, but per-step working set technically classifies as ambiguous on this
            axis. Real bottleneck is on Axis C.
        arithmetic_intensity:
          value: bandwidth_bound
          flops_per_byte: 0.5
          classification_rule: intensity 0.50 vs [low 13.71, high 54.84]; util 0.018 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/nanogpt-decode_2026-04-16T21-09-50Z_f5f554162523.json
          evidence_sha256_short: 038cbc9c6448
          measured_at: 2026-04-16T21-09-50Z
          platform_machine_class: apple-silicon
        dispatch:
          value: dispatch_bound
          utilization: 0.0175
          achieved_bw_gbps: 9.248
          observation_source: iter-3 smoke_nanogpt_phases.py on M-series MPS; the canonical dispatch_bound case
          classification_rule: intensity 0.50 vs [low 13.71, high 54.84]; util 0.018 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/nanogpt-decode_2026-04-16T21-09-50Z_f5f554162523.json
          evidence_sha256_short: 038cbc9c6448
          measured_at: 2026-04-16T21-09-50Z
          platform_machine_class: apple-silicon

    micro-diffusion-train:
      model: micro-diffusion-32px
      params: 2.0M
      dataset: cifar10
      dataset_source: Krizhevsky (2009), auto-download via torchvision, 170MB
      quality_target:
        metric: mse_loss
        value: 0.002
      verified_baseline:
        train_loss: 0.002
        val_loss: 0.0
        epochs: 20
        time_seconds: 41
      scenario: offline
      provenance: Ho et al. 2020 (DDPM); U-Net denoising autoencoder
      regime:
        working_set:
          value: unmeasured
          peak_bytes_per_step: 8388608
          note: "2M-param U-Net on CIFAR-10 32x32; ~8 MB per-step in the grey band \u2014 measure."
        arithmetic_intensity:
          value: bandwidth_bound
          flops_per_byte: 3.636
          classification_rule: intensity 3.64 vs [low 13.71, high 54.84]; util 0.014 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/micro-diffusion-train_2026-04-16T21-04-27Z_dd0624db92be.json
          evidence_sha256_short: b95acc78eacc
          measured_at: 2026-04-16T21-04-27Z
          platform_machine_class: apple-silicon
        dispatch:
          value: dispatch_bound
          note: Conv kernel sizes likely large enough but verify
          utilization: 0.0143
          achieved_bw_gbps: 7.557
          classification_rule: intensity 3.64 vs [low 13.71, high 54.84]; util 0.014 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/micro-diffusion-train_2026-04-16T21-04-27Z_dd0624db92be.json
          evidence_sha256_short: b95acc78eacc
          measured_at: 2026-04-16T21-04-27Z
          platform_machine_class: apple-silicon

    micro-gnn-train:
      model: micro-gnn
      params: 5.6K
      dataset: cora
      dataset_source: Sen et al. (2008), Cora citation network (2708 nodes, 7 classes, 5429 edges); auto-download; synthetic
        fallback available for offline use
      quality_target:
        metric: test_accuracy
        value: 0.78
      verified_baseline:
        train_loss: 0.35
        val_accuracy: 0.816
        test_accuracy: 0.816
        epochs: 50
        time_seconds: 2
        note: 81.6% matches Kipf & Welling (2017) published result on Cora
      scenario: single_stream
      provenance: Kipf & Welling 2017 (GCN); maps to MLPerf Training GNN
      regime:
        working_set:
          value: cache_resident
          peak_bytes_per_step: 204800
          note: Cora has 2708 nodes, 5429 edges; full graph fits in L1
        arithmetic_intensity:
          value: bandwidth_bound
          flops_per_byte: 2.0
          classification_rule: sparse adjacency multiply has very low arithmetic intensity
        dispatch:
          value: unmeasured
          note: Per-step kernels are microscopic; very likely dispatch_bound but needs measurement

    micro-bert-train:
      model: micro-bert
      params: 432K
      dataset: sst2
      dataset_source: Socher et al. (2013), Stanford Sentiment Treebank binary, 67K train / 872 val; ships in data/sst2/
      quality_target:
        metric: val_accuracy
        value: 0.78
      verified_baseline:
        train_loss: 0.15
        val_accuracy: 0.77
        epochs: 15
        time_seconds: 45
        note: Character-level tokenization; 77% on real SST-2 with 0.5M-param model
      scenario: single_stream
      provenance: Devlin et al. 2019 (BERT); bidirectional transformer for NLU
      regime:
        working_set:
          value: cache_resident
          peak_bytes_per_step: 4194304
          note: 432K params, short SST-2 sequences
        arithmetic_intensity:
          value: compute_bound
          flops_per_byte: 455.764
          classification_rule: intensity 455.76 vs [low 13.71, high 54.84]; util 0.066 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/micro-bert-train_2026-04-16T21-04-51Z_12f7306a7fea.json
          evidence_sha256_short: 85c06029464a
          measured_at: 2026-04-16T21-04-51Z
          platform_machine_class: apple-silicon
        dispatch:
          value: dispatch_bound
          utilization: 0.0658
          achieved_bw_gbps: 2.089
          classification_rule: intensity 455.76 vs [low 13.71, high 54.84]; util 0.066 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/micro-bert-train_2026-04-16T21-04-51Z_12f7306a7fea.json
          evidence_sha256_short: 85c06029464a
          measured_at: 2026-04-16T21-04-51Z
          platform_machine_class: apple-silicon

    micro-lstm-train:
      model: micro-lstm
      params: 51K
      dataset: etth1
      dataset_source: Zhou et al. (2021, AAAI Informer), ETTh1 hourly electricity transformer temps, 17K observations; ships
        in data/etth1/
      quality_target:
        metric: val_mse
        value: 0.13
      verified_baseline:
        train_loss: 0.017
        val_loss: 0.17
        epochs: 30
        time_seconds: 20
        note: "Classic overfitting pattern \u2014 val MSE plateaus at epoch 5 then rises"
      scenario: single_stream
      provenance: Hochreiter & Schmidhuber 1997 (LSTM); time-series forecasting
      regime:
        working_set:
          value: cache_resident
          peak_bytes_per_step: 524288
          note: 51K params, 96-step horizon
        arithmetic_intensity:
          value: compute_bound
          flops_per_byte: 348.953
          classification_rule: intensity 348.95 vs [low 13.71, high 54.84]; util 0.005 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/micro-lstm-train_2026-04-16T21-05-16Z_101c2c64fd67.json
          evidence_sha256_short: 53d6a03fd518
          measured_at: 2026-04-16T21-05-16Z
          platform_machine_class: apple-silicon
        dispatch:
          value: dispatch_bound
          note: Sequential timesteps invite dispatch overhead; likely dispatch_bound, verify
          utilization: 0.0046
          achieved_bw_gbps: 0.192
          classification_rule: intensity 348.95 vs [low 13.71, high 54.84]; util 0.005 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/micro-lstm-train_2026-04-16T21-05-16Z_101c2c64fd67.json
          evidence_sha256_short: 53d6a03fd518
          measured_at: 2026-04-16T21-05-16Z
          platform_machine_class: apple-silicon

    micro-rl-train:
      model: micro-rl
      params: 17K
      dataset: cartpole_local
      dataset_source: Pure-Python CartPole environment; physics from OpenAI Gym spec, no download
      quality_target:
        metric: avg_episode_reward
        value: 195
      verified_baseline:
        avg_reward: 9.5
        episodes: 500
        time_seconds: 1
        note: "REINFORCE is high-variance by design \u2014 pedagogical value in showing RL difficulty"
      scenario: single_stream
      provenance: Williams 1992 (REINFORCE); policy gradient on classic control
      regime:
        working_set:
          value: cache_resident
          peak_bytes_per_step: 65536
          note: 17K params, tiny actor-critic
        arithmetic_intensity:
          value: bandwidth_bound
          flops_per_byte: 2.0
          classification_rule: env step in Python dominates; nn forward is microscopic
        dispatch:
          value: unmeasured
          note: Almost certainly dispatch_bound; verify

  edge:
    resnet18-train:
      model: resnet18
      params: 11.2M
      dataset: cifar100
      dataset_source: Krizhevsky (2009), auto-download via torchvision, 170MB; cached locally after first download
      quality_target:
        metric: top1_accuracy
        value: 0.36
      verified_baseline:
        train_loss: 2.456
        val_loss: 2.485
        accuracy: 0.363
        epochs: 25
        time_seconds: 64
        note: 36.3% after 25 epochs; systems tuning (augmentation, schedule) reaches 50%+
      scenario: single_stream
      provenance: "He et al. 2016 (ResNet); FULLY LOCAL implementation \u2014 no torchvision.models dependency"
      regime:
        working_set:
          value: unmeasured
          peak_bytes_per_step: 48234496
          note: "11.2M params; activations + weights ~46 MB at B=64. Just under 4*LLC threshold; classification depends on whether activations stream or stay resident \u2014 measure."
        arithmetic_intensity:
          value: bandwidth_bound
          flops_per_byte: 3.931
          classification_rule: intensity 3.93 vs [low 13.71, high 54.84]; util 0.059 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/resnet18-train_2026-04-16T21-05-40Z_01868a7fce6e.json
          evidence_sha256_short: d600a1c86702
          measured_at: 2026-04-16T21-05-40Z
          platform_machine_class: apple-silicon
        dispatch:
          value: dispatch_bound
          utilization: 0.0589
          achieved_bw_gbps: 31.054
          classification_rule: intensity 3.93 vs [low 13.71, high 54.84]; util 0.059 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/resnet18-train_2026-04-16T21-05-40Z_01868a7fce6e.json
          evidence_sha256_short: d600a1c86702
          measured_at: 2026-04-16T21-05-40Z
          platform_machine_class: apple-silicon

    mobilenetv2-train:
      model: mobilenetv2
      params: 2.4M
      dataset: cifar100
      dataset_source: Krizhevsky (2009), auto-download via torchvision, 170MB; cached locally after first download
      quality_target:
        metric: top1_accuracy
        value: 0.4
      verified_baseline:
        epochs: 15
        time_seconds: 60
        note: Inverted residual architecture with depthwise separable convolutions
      scenario: single_stream
      provenance: "Sandler et al. 2018 (MobileNetV2); FULLY LOCAL \u2014 no torchvision.models dependency"
      regime:
        working_set:
          value: unmeasured
          peak_bytes_per_step: 10485760
          note: "2.4M params, depthwise-separable convs; 10 MB per-step in grey band \u2014 measure."
        arithmetic_intensity:
          value: bandwidth_bound
          flops_per_byte: 3.691
          classification_rule: intensity 3.69 vs [low 13.71, high 54.84]; util 0.005 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/mobilenetv2-train_2026-04-16T21-09-06Z_6a95ef082eee.json
          evidence_sha256_short: 71606a91b999
          measured_at: 2026-04-16T21-09-06Z
          platform_machine_class: apple-silicon
        dispatch:
          value: dispatch_bound
          note: Despite 'edge' label, MobileNetV2 is mostly memory-bound, not compute-bound
          utilization: 0.005
          achieved_bw_gbps: 2.645
          classification_rule: intensity 3.69 vs [low 13.71, high 54.84]; util 0.005 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/mobilenetv2-train_2026-04-16T21-09-06Z_6a95ef082eee.json
          evidence_sha256_short: 71606a91b999
          measured_at: 2026-04-16T21-09-06Z
          platform_machine_class: apple-silicon

  tiny:
    dscnn-kws-train:
      model: dscnn-kws
      params: 20K
      dataset: speech_commands_v2
      dataset_source: Warden (2018), auto-download via torchaudio, ~2GB
      quality_target:
        metric: top1_accuracy
        value: 0.9
      verified_baseline:
        train_loss: 0.916
        val_loss: 1.026
        accuracy: 0.712
        epochs: 10
        time_seconds: 51
        note: 71.2% after 10 epochs on 12-class (10 keywords + unknown + silence)
      max_model_size_kb: 100
      scenario: offline
      provenance: Zhang et al. 2017 (Hello Edge); maps to MLPerf Tiny KWS
      regime:
        working_set:
          value: cache_resident
          peak_bytes_per_step: 2097152
          note: 20K params + 40x101 spectrogram inputs
        arithmetic_intensity:
          value: compute_bound
          flops_per_byte: 70
          classification_rule: moderate intensity from spectrogram convs
        dispatch:
          value: unmeasured

    anomaly-ae-train:
      model: anomaly-ae
      params: 0.3M
      dataset: mnist
      dataset_source: LeCun et al. 1998, auto-download via torchvision, 12MB
      quality_target:
        metric: reconstruction_mse
        value: 0.04
      verified_baseline:
        train_loss: 0.034
        val_loss: 0.065
        epochs: 20
        time_seconds: 5
        note: "Val loss intentionally higher \u2014 anomalous digits have high recon error"
      max_model_size_kb: 32
      scenario: offline
      provenance: MLPerf Tiny AD benchmark; Koizumi et al. 2019 (ToyADMOS architecture)
      regime:
        working_set:
          value: cache_resident
          peak_bytes_per_step: 2097152
          note: 0.3M params FC autoencoder on MNIST
        arithmetic_intensity:
          value: unmeasured
          flops_per_byte: 27.728
          classification_rule: intensity 27.73 vs [low 13.71, high 54.84]; util 0.007 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/anomaly-ae-train_2026-04-16T21-08-06Z_147f35c69389.json
          evidence_sha256_short: cc6fd54bba1d
          measured_at: 2026-04-16T21-08-06Z
          platform_machine_class: apple-silicon
        dispatch:
          value: dispatch_bound
          utilization: 0.0068
          achieved_bw_gbps: 3.567
          classification_rule: intensity 27.73 vs [low 13.71, high 54.84]; util 0.007 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/anomaly-ae-train_2026-04-16T21-08-06Z_147f35c69389.json
          evidence_sha256_short: cc6fd54bba1d
          measured_at: 2026-04-16T21-08-06Z
          platform_machine_class: apple-silicon

    wake-vision-vww:
      model: wake-vision-vww
      params: 8.5K
      dataset: wake_vision
      dataset_source: Banbury et al. 2024 (CVPR), HuggingFace Harvard-Edge/Wake-Vision, 6GB; 5K pedagogical subset with CIFAR-10
        proxy fallback
      quality_target:
        metric: binary_accuracy
        value: 0.85
      verified_baseline:
        train_loss: 0.31
        val_loss: 0.33
        accuracy: 0.873
        epochs: 20
        time_seconds: 10
      max_model_size_kb: 16
      scenario: offline
      provenance: Banbury et al. 2024 (Wake Vision, CVPR); maps to MLPerf Tiny VWW
      regime:
        working_set:
          value: cache_resident
          peak_bytes_per_step: 524288
          note: 8.5K-param micro-CNN
        arithmetic_intensity:
          value: bandwidth_bound
          flops_per_byte: 0.128
          classification_rule: intensity 0.13 vs [low 13.71, high 54.84]; util 0.007 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/wake-vision-vww_2026-04-16T21-08-42Z_99341b1751d0.json
          evidence_sha256_short: acacdaca08f5
          measured_at: 2026-04-16T21-08-42Z
          platform_machine_class: apple-silicon
        dispatch:
          value: dispatch_bound
          note: Tiny model may push dispatch overhead per step; verify
          utilization: 0.0073
          achieved_bw_gbps: 3.873
          classification_rule: intensity 0.13 vs [low 13.71, high 54.84]; util 0.007 vs [dispatch 0.25, sat 0.5]
          evidence_sidecar: roofline/wake-vision-vww_2026-04-16T21-08-42Z_99341b1751d0.json
          evidence_sha256_short: acacdaca08f5
          measured_at: 2026-04-16T21-08-42Z
          platform_machine_class: apple-silicon

  agent:
    nano-rag-agent:
      model: nano-rag-agent
      params: 20.1M
      dataset: react_traces
      dataset_source: Structured multi-step reasoning traces; ReAct format
      quality_target:
        metric: retrieval_accuracy
        value: 0.8
      metrics:
        - retrieve_latency_ms
        - generate_latency_ms
        - queries_per_second
      scenario: server
      provenance: Lewis et al. 2020 (RAG); measures retrieval vs generation bottleneck
      regime:
        working_set:
          value: dram_bound
          peak_bytes_per_step: 88080384
          note: 'Generation phase: 20M-param decode (~80 MB activations + weights re-read) + retrieval index lookup.'
        arithmetic_intensity:
          value: bandwidth_bound
          flops_per_byte: 0.7
          classification_rule: generation is decode (bw-bound); retrieval is index gather
        dispatch:
          value: dispatch_bound
          utilization: 0.1
          observation_source: 'composite workload: decode + retrieval, both small-kernel'

    nano-codegen-agent:
      model: nano-codegen-agent
      params: 13.7M
      dataset: mbpp
      dataset_source: Austin et al. 2021 (MBPP); 20 curated Python problems with unit tests
      quality_target:
        metric: pass_at_1
        value: 0.15
      metrics:
        - iterations_to_correct
        - tokens_per_attempt
        - context_growth_factor
      scenario: server
      provenance: Chen et al. 2021 (Codex); measures iterative retry cost
      regime:
        working_set:
          value: dram_bound
          peak_bytes_per_step: 60817408
          note: 13.7M-param iterative regeneration; per-step working set ~55 MB exceeds 4*LLC.
        arithmetic_intensity:
          value: bandwidth_bound
          flops_per_byte: 0.7
          classification_rule: iterative decode loop
        dispatch:
          value: dispatch_bound
          utilization: 0.1

    nano-react-agent:
      model: nano-react-agent
      params: 13.7M
      dataset: react_traces
      dataset_source: Structured multi-step reasoning traces; ReAct format
      quality_target:
        metric: trace_accuracy
        value: 0.6
      metrics:
        - steps_to_answer
        - reasoning_latency_per_step_ms
        - tool_dispatch_latency_ms
      scenario: server
      provenance: Yao et al. 2023 (ReAct); measures multi-step reasoning + tool use
      regime:
        working_set:
          value: dram_bound
          peak_bytes_per_step: 60817408
          note: "13.7M-param multi-step reasoning; iter-2 found this still uses non-KV-cache forward \u2014 current 58 MB number assumes the (broken) recompute path; will shrink once iter-7 lands KV-cache."
        arithmetic_intensity:
          value: bandwidth_bound
          flops_per_byte: 0.7
          classification_rule: ReAct decode loop
        dispatch:
          value: dispatch_bound
          utilization: 0.1
          note: To be patched in iter-7 to use KV-cache path; regime values may shift

    nano-toolcall-agent:
      model: nano-toolcall-agent
      params: 13.7M
      dataset: react_traces
      dataset_source: Structured multi-step reasoning traces; ReAct format
      metrics:
        - classification_latency_ms
        - json_validity_rate
        - queries_per_second
      scenario: server
      provenance: Schick et al. 2024 (Toolformer); measures structured output generation
      regime:
        working_set:
          value: dram_bound
          peak_bytes_per_step: 60817408
          note: Bonus workload (not in core 16); structured output generation
        arithmetic_intensity:
          value: bandwidth_bound
          flops_per_byte: 0.7
          classification_rule: tool-call decode
        dispatch:
          value: dispatch_bound
          utilization: 0.1