mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-07 18:18:42 -05:00
- roofline-model.svg: Classic Roofline with LLM decode + CNN training points - iron-law-decomposition.svg: Iron Law equation with wall-to-term mapping - serving-two-phases.svg: Prefill (compute) vs Decode (memory) phases - allreduce-ring.svg: 8-GPU ring with reduce-scatter + all-gather - hardware-spectrum.svg: nRF52840 → ESP32 → Jetson → H100 → NVL72 scale - carbon-geography.svg: Norway/Quebec/US/Poland bar chart (41x gap) All follow svg-style.md: 900x500 viewBox, semantic colors, Helvetica font.
1939 lines
48 KiB
JSON
1939 lines
48 KiB
JSON
{
|
|
"version": "1.0.0",
|
|
"description": "Applicability matrix: which topic\u00d7track combinations produce meaningful ML systems interview questions. Exclusions are grounded in hardware physics \u2014 a combination is excluded when the underlying concept has no physical substrate on that hardware tier.",
|
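|
"methodology": "A topic\u00d7track pair is marked not-applicable by default if it has < 3 questions in a 7,500+ question corpus generated across all combinations. Each exclusion includes a physics-grounded reason explaining why the concept does not apply to that hardware tier. Pairs whose applicability departs from the threshold (kept applicable despite < 3 questions, or excluded despite \u2265 3) are overrides and also carry a reason.",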
|
|
"stats": {
|
|
"total_topics": 79,
|
|
"tracks": [
|
|
"cloud",
|
|
"edge",
|
|
"mobile",
|
|
"tinyml"
|
|
],
|
|
"zones": [
|
|
"recall",
|
|
"analyze",
|
|
"design",
|
|
"implement",
|
|
"diagnosis",
|
|
"specification",
|
|
"fluency",
|
|
"evaluation",
|
|
"realization",
|
|
"optimization",
|
|
"mastery"
|
|
],
|
|
"applicable_topic_track_pairs": 233,
|
|
"excluded_topic_track_pairs": 83,
|
|
"applicable_cells": 2563,
|
|
"excluded_cells": 913,
|
|
"exclusion_rate": "26.3%"
|
|
},
|
|
"matrix": [
|
|
{
|
|
"topic": "3d-parallelism",
|
|
"name": "3D Parallelism",
|
|
"area": "parallelism",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 44
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 9
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 11,
|
|
"reason": "3D parallelism requires multi-node GPU clusters; mobile devices are single-SoC"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 8,
|
|
"reason": "3D parallelism requires multi-node GPU clusters; MCUs are single-core devices"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "ab-rollout-strategies",
|
|
"name": "A/B & Rollout Strategies",
|
|
"area": "deployment",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 12
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 23
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Insufficient evidence in the 7,500+ question corpus (0 questions)"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "accelerator-comparison",
|
|
"name": "Accelerator Comparison",
|
|
"area": "compute",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 13
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 24
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 26
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "activation-memory",
|
|
"name": "Activation Memory & Checkpointing",
|
|
"area": "memory",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 19
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "Mobile NPUs manage activations in hardware; no programmer-visible activation memory"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "MCU activations fit in tensor arena (<100 KB); checkpointing overhead > benefit"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "adversarial-robustness",
|
|
"name": "Adversarial Robustness & Security",
|
|
"area": "reliability",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 14
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 25
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 23
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "attention-scaling",
|
|
"name": "Attention Scaling & Variants",
|
|
"area": "architecture",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 16
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 11
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 18
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 5
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "batching-strategies",
|
|
"name": "Batching Strategies",
|
|
"area": "latency",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 42
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 26
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "Mobile inference is single-request; no batching dimension"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs process one input at a time; batching requires memory MCUs lack"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "cnn-efficient-design",
|
|
"name": "CNN Efficient Design",
|
|
"area": "architecture",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 5
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 23
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 15
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 53
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "collective-communication",
|
|
"name": "Collective Communication",
|
|
"area": "networking",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 200
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 14
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 15
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 16
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "compound-ai-systems",
|
|
"name": "Compound AI Systems",
|
|
"area": "deployment",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 57
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 5
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs can't run multi-model pipelines; insufficient memory for even one LLM"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "compute-cost-estimation",
|
|
"name": "Compute Cost Estimation",
|
|
"area": "compute",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 11
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 18
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 19
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "congestion-control",
|
|
"name": "Congestion & Flow Control",
|
|
"area": "networking",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 24
|
|
},
|
|
"edge": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "ECMP/PFC require multi-tier switch fabrics; edge devices use point-to-point links"
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "No multi-hop packet-switched fabric on mobile SoCs"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs use SPI/I2C buses, not packet-switched networks"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "container-orchestration",
|
|
"name": "Container & Cluster Orchestration",
|
|
"area": "deployment",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 23
|
|
},
|
|
"edge": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "K8s GPU scheduling assumes shared cluster; edge deploys single-device"
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "No container runtime on mobile OSes (iOS/Android sandbox model)"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "No OS or container runtime on bare-metal MCUs"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "data-efficiency-selection",
|
|
"name": "Data Efficiency & Selection",
|
|
"area": "data",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 13
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 26
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 18
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "TinyML uses pre-trained models; no on-device data selection"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "data-parallelism",
|
|
"name": "Data Parallelism",
|
|
"area": "parallelism",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 76
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 5
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "Single-device mobile inference; no data-parallel training"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs don't train; data parallelism requires distributed compute"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "data-pipeline-engineering",
|
|
"name": "Data Pipeline Engineering",
|
|
"area": "data",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 220
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 48
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 24
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 30
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "data-quality-validation",
|
|
"name": "Data Quality & Validation",
|
|
"area": "data",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 16
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 23
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs receive pre-processed inputs; no data validation pipeline"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "datacenter-efficiency",
|
|
"name": "Datacenter Efficiency",
|
|
"area": "power",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 19
|
|
},
|
|
"edge": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "PUE/cooling metrics apply to facilities, not standalone edge boxes"
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "No facility-level power distribution on phones"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "MCUs draw milliwatts; no cooling or power distribution unit"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "dataset-curation",
|
|
"name": "Dataset Curation & Labeling",
|
|
"area": "data",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 13
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 26
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "differential-privacy",
|
|
"name": "Differential Privacy",
|
|
"area": "cross-cutting",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 11
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 2,
|
|
"reason": "Edge devices process user data locally; DP-SGD and membership inference protection apply wherever training data is handled"
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs don't train or store datasets; DP is not applicable"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "distribution-drift-detection",
|
|
"name": "Distribution & Drift Detection",
|
|
"area": "reliability",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 8
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 23
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 19
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs lack statistical libraries and reference distributions"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "dma-data-movement",
|
|
"name": "DMA & Data Movement",
|
|
"area": "memory",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 7
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 24
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 0,
|
|
"reason": "Mobile NPU inference is DMA-bound; VTCM loading, camera ISP-to-ML zero-copy, and DRAM bandwidth contention are core performance factors"
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 23
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "duty-cycling",
|
|
"name": "Duty Cycling & Energy Harvesting",
|
|
"area": "power",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Datacenter GPUs run 24/7 at full utilization; duty cycling wastes CapEx"
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 3
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 12
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 50
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "encoder-decoder-tradeoffs",
|
|
"name": "Encoder-Decoder Tradeoffs",
|
|
"area": "architecture",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 9
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 20
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs run tiny CNNs/RNNs; encoder-decoder architectures don't fit in SRAM"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "energy-per-operation",
|
|
"name": "Energy Per Operation",
|
|
"area": "power",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 23
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 28
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 20
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "extreme-quantization",
|
|
"name": "Extreme Quantization",
|
|
"area": "precision",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 27
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 20
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 24
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "fairness-evaluation",
|
|
"name": "Fairness & Bias Evaluation",
|
|
"area": "cross-cutting",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 13
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 20
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs run keyword spotting / anomaly detection; fairness metrics not applicable"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "fault-tolerance-checkpointing",
|
|
"name": "Fault Tolerance & Checkpointing",
|
|
"area": "reliability",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 194
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 16
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 7
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "MCU inference takes milliseconds; no checkpoint needed for sub-second tasks"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "feature-store-management",
|
|
"name": "Feature Store & Feature Engineering",
|
|
"area": "data",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 25
|
|
},
|
|
"edge": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Feature stores require database infrastructure; edge serves pre-computed features"
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Mobile apps consume features via API, don't manage stores"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs run fixed models; no feature engineering pipeline"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "federated-learning",
|
|
"name": "Federated Learning",
|
|
"area": "cross-cutting",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 69
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 60
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 71
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 44
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "flash-attention",
|
|
"name": "IO-Aware Attention",
|
|
"area": "optimization",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 20
|
|
},
|
|
"edge": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "Flash attention tiles HBM traffic through on-chip SRAM; edge LPDDR has a flat hierarchy"
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "Mobile NPUs run fused attention kernels; no programmer-visible SRAM tiling"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "Attention is rarely used on MCUs; SRAM too small for any attention variant"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "gpu-compute-architecture",
|
|
"name": "GPU Compute Architecture",
|
|
"area": "compute",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 10
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "Mobile uses NPU/DSP, not GPU CUDA cores; different architecture"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs have no GPU; they use fixed-function MAC units"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "graceful-degradation",
|
|
"name": "Graceful Degradation",
|
|
"area": "reliability",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 15
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "gradient-synchronization",
|
|
"name": "Gradient Synchronization",
|
|
"area": "parallelism",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
},
|
|
"edge": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "Edge is inference-only; no distributed training gradients"
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "Mobile runs inference; no gradient communication"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "MCUs don't train; no gradients to synchronize"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "graph-compilation",
|
|
"name": "Graph Compilation & Optimization",
|
|
"area": "optimization",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 45
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 27
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 33
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 30
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "interconnect-topology",
|
|
"name": "Interconnect Topology",
|
|
"area": "networking",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 14
|
|
},
|
|
"edge": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "Edge devices are standalone; no multi-device interconnect topology"
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 7
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs are single-chip; no interconnect"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "kernel-fusion",
|
|
"name": "Kernel & Operator Fusion",
|
|
"area": "optimization",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 18
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 30
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 11
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "MCU inference uses pre-compiled operators; no runtime kernel fusion"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "knowledge-distillation",
|
|
"name": "Knowledge Distillation",
|
|
"area": "optimization",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 14
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 24
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 18
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 19
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "kv-cache-management",
|
|
"name": "KV-Cache Management",
|
|
"area": "memory",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 13
|
|
},
|
|
"edge": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Edge devices rarely run autoregressive LLMs; KV cache not relevant"
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 20
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs can't run transformer inference; no KV cache"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "latency-decomposition",
|
|
"name": "Latency Decomposition",
|
|
"area": "latency",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 123
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 32
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 46
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 26
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "load-balancing",
|
|
"name": "Load Balancing & Routing",
|
|
"area": "networking",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 17
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 27
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Mobile runs single model on single SoC; no load to balance"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs serve one model; no load balancing"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "mcu-compute-constraints",
|
|
"name": "MCU Compute Constraints",
|
|
"area": "compute",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Cloud GPUs have TB-scale memory and TFLOPS; MCU constraints don't apply"
|
|
},
|
|
"edge": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Edge accelerators (Jetson, Hailo) have GB-scale memory; not MCU-constrained"
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Mobile SoCs have GB memory and NPUs; beyond MCU-class"
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 142
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "memory-hierarchy-design",
|
|
"name": "Memory Hierarchy Design",
|
|
"area": "memory",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 75
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 28
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 41
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 25
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "memory-mapped-inference",
|
|
"name": "Memory-Mapped Inference",
|
|
"area": "memory",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 3
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 23
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 18
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs use XIP from flash, not mmap; different execution model"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "memory-pressure-management",
|
|
"name": "Memory Pressure Management",
|
|
"area": "memory",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 18
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 18
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCU memory is statically allocated at compile time; no dynamic pressure"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "mixed-precision-training",
|
|
"name": "Mixed-Precision Training & Inference",
|
|
"area": "precision",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 16
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 0,
|
|
"reason": "INT8 weights with INT16 activations and per-layer precision selection is the defining optimization for TinyML deployment on MCUs"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "mixture-of-experts",
|
|
"name": "Mixture of Experts",
|
|
"area": "architecture",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
},
|
|
"edge": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "MoE routing requires >10 GB memory; exceeds edge device capacity"
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Expert routing overhead exceeds mobile memory/compute budget"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MoE models are 10-100 GB; MCUs have 256 KB-8 MB SRAM"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "mlops-lifecycle",
|
|
"name": "MLOps Lifecycle",
|
|
"area": "deployment",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 109
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 28
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 20
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "MCU model deployment is firmware flashing, not MLOps pipelines"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "model-format-conversion",
|
|
"name": "Model Format & Runtime Conversion",
|
|
"area": "deployment",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "Cloud runs native PyTorch/JAX; format conversion rarely needed"
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "model-serving-infrastructure",
|
|
"name": "Model Serving Infrastructure",
|
|
"area": "deployment",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 206
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 38
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 55
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "MCUs run bare-metal inference; no serving infrastructure"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "model-size-estimation",
|
|
"name": "Model Size Estimation",
|
|
"area": "architecture",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 14
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 20
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "model-tensor-parallelism",
|
|
"name": "Model & Tensor Parallelism",
|
|
"area": "parallelism",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 24
|
|
},
|
|
"edge": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "TP requires NVLink-class interconnect; edge has no inter-device fabric"
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Single-SoC mobile devices can't split tensors across chips"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs have one core; no parallelism substrate"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "monitoring-observability",
|
|
"name": "Monitoring & Observability",
|
|
"area": "reliability",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 9
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 29
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 28
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 5
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "network-bandwidth-bottlenecks",
|
|
"name": "Network Bandwidth Bottlenecks",
|
|
"area": "networking",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
},
|
|
"edge": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "Edge inference is local; network bandwidth rarely bottlenecks inference"
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 4
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "MCUs process locally; BLE/UART bandwidth is for telemetry, not inference"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "neural-architecture-search",
|
|
"name": "Neural Architecture Search",
|
|
"area": "architecture",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 15
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 26
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 23
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "operator-scheduling",
|
|
"name": "Operator Scheduling",
|
|
"area": "optimization",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 12
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 24
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 23
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "ota-firmware-updates",
|
|
"name": "OTA & Firmware Updates",
|
|
"area": "deployment",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Cloud uses container registries, not OTA firmware; different update model"
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 41
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Mobile uses app store updates, not firmware OTA"
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 45
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "pipeline-parallelism",
|
|
"name": "Pipeline Parallelism",
|
|
"area": "parallelism",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
},
|
|
"edge": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "PP requires multiple accelerators in sequence; edge has one"
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Single-SoC; no pipeline stages possible"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Single MCU; pipeline parallelism is meaningless"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "power-budgeting",
|
|
"name": "Power Budgeting",
|
|
"area": "power",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 85
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 42
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 50
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 25
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "profiling-bottleneck-analysis",
|
|
"name": "Profiling & Bottleneck Analysis",
|
|
"area": "latency",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 12
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 23
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 24
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "pruning-sparsity",
|
|
"name": "Pruning & Sparsity",
|
|
"area": "optimization",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 192
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 32
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 25
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 28
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "quantization-fundamentals",
|
|
"name": "Quantization Fundamentals",
|
|
"area": "precision",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 58
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 36
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 48
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 23
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "queueing-theory",
|
|
"name": "Queueing Theory",
|
|
"area": "latency",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 18
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 3
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "Mobile inference is synchronous single-request; no queue"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "MCU processes one sensor reading at a time; no queueing"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "rdma-transport",
|
|
"name": "RDMA & High-Performance Transport",
|
|
"area": "networking",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
},
|
|
"edge": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "RDMA requires InfiniBand/RoCE NICs; edge uses Ethernet/WiFi"
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "No RDMA-capable hardware on mobile SoCs"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs use SPI/UART/BLE; no network stack for RDMA"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "real-time-deadlines",
|
|
"name": "Real-Time Deadlines",
|
|
"area": "latency",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Cloud optimizes throughput/cost, not hard real-time guarantees"
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 171
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 95
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 43
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "responsible-ai",
|
|
"name": "Responsible AI & Governance",
|
|
"area": "cross-cutting",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 11
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 23
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "roofline-analysis",
|
|
"name": "Roofline Analysis",
|
|
"area": "compute",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 140
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 49
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 55
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 20
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "safety-certification",
|
|
"name": "Safety & Certification",
|
|
"area": "reliability",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Cloud ML serves best-effort; no DO-178C/IEC 61508 certification required"
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 29
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 31
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "scheduling-resource-management",
|
|
"name": "Scheduling & Resource Management",
|
|
"area": "parallelism",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 25
|
|
},
|
|
"edge": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "No multi-tenant GPU cluster on edge; single workload per device"
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Mobile OS manages resources; no ML-specific scheduler"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Bare-metal MCU runs one model; no scheduling needed"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "speculative-decoding",
|
|
"name": "Speculative Decoding",
|
|
"area": "optimization",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 14
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 4
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 5
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs can't run even one LLM; speculative decoding needs two"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "storage-format-selection",
|
|
"name": "Storage Format & Selection",
|
|
"area": "data",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 14
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 20
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "Mobile uses CoreML/TFLite; format is fixed by platform"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 0,
|
|
"reason": "MCUs use flatbuffer TFLite Micro; no format choice"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "streaming-ingestion",
|
|
"name": "Streaming & Real-Time Ingestion",
|
|
"area": "data",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 16
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 18
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 22
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "systolic-dataflow",
|
|
"name": "Systolic Arrays & Dataflow",
|
|
"area": "compute",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 7
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 19
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 0,
|
|
"reason": "Mobile NPUs (Hexagon HTP, Apple ANE) use dataflow execution; operator shapes determine 10x NPU utilization differences"
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 0,
|
|
"reason": "TinyML-class NPUs (Ethos-U55, Syntiant NDP120) use dataflow architectures; understanding weight-stationary vs output-stationary is essential"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "tail-latency",
|
|
"name": "Tail Latency & SLAs",
|
|
"area": "latency",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 17
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 3
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "Mobile inference is single-request; tail latency is a fleet concept"
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "MCU inference is deterministic (~ms); no tail latency distribution"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "tco-cost-modeling",
|
|
"name": "TCO & Cost Modeling",
|
|
"area": "cross-cutting",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 32
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 3
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 5
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 5
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "tensor-arena-planning",
|
|
"name": "Tensor Arena Planning",
|
|
"area": "memory",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 6
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 5
|
|
},
|
|
"mobile": {
|
|
"applicable": false,
|
|
"question_count": 1,
|
|
"reason": "Mobile frameworks manage memory automatically; no manual arena planning"
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "thermal-management",
|
|
"name": "Thermal Management",
|
|
"area": "power",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 13
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 28
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 20
|
|
},
|
|
"tinyml": {
|
|
"applicable": false,
|
|
"question_count": 2,
|
|
"reason": "MCUs draw <0.5W; thermal throttling doesn't occur"
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "transformer-systems-cost",
|
|
"name": "Transformer Systems Cost",
|
|
"area": "architecture",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 158
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 38
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 35
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 7
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"topic": "vram-budgeting",
|
|
"name": "VRAM Budgeting",
|
|
"area": "memory",
|
|
"tracks": {
|
|
"cloud": {
|
|
"applicable": true,
|
|
"question_count": 161
|
|
},
|
|
"edge": {
|
|
"applicable": true,
|
|
"question_count": 42
|
|
},
|
|
"mobile": {
|
|
"applicable": true,
|
|
"question_count": 21
|
|
},
|
|
"tinyml": {
|
|
"applicable": true,
|
|
"question_count": 4
|
|
}
|
|
}
|
|
}
|
|
]
|
|
} |